markdown.cpp
Go to the documentation of this file.
1 
2 /*
3  Copyright (c) 2009 by Chad Nelson
4  Released under the MIT License.
5  See the provided LICENSE.TXT file for details.
6 */
7 
8 #include "markdown.h"
9 
10 #include <cassert>
11 #include <sstream>
12 
13 #include <boost/regex.hpp>
14 
15 #include <SimoxUtility/algorithm/string/string_tools.h>
16 
17 #include "markdown-tokens.h"
18 
19 
21 using markdown::TokenPtr;
22 
23 namespace
24 {
25 
// Description of a single HTML tag as recognised by parseHtmlTag().
//
// Both scalar members carry default initializers so that a freshly declared
// HtmlTagInfo never exposes indeterminate values, even if a caller forgets to
// fill one of them in.
struct HtmlTagInfo
{
    std::string tagName;      // Tag name (regex capture 3)
    std::string extra;        // Attribute text (regex capture 4); empty if none was captured
    bool isClosingTag = false; // True when the tag starts with "</"
    size_t lengthOfToken = 0;  // In original string
};
32 
33  const std::string cHtmlTokenSource(
34  "<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))>");
35  const boost::regex cHtmlTokenExpression(cHtmlTokenSource),
36  cStartHtmlTokenExpression("^" + cHtmlTokenSource),
37  cOneHtmlTokenExpression("^" + cHtmlTokenSource + "$");
38 
// Selects which anchoring parseHtmlTag() applies to its pattern.
enum ParseHtmlTagFlags
{
    cAlone = 0, // The tag has to span the entire range (pattern anchored with ^ and $)
    cStarts = 1 // The tag only has to begin the range (pattern anchored with ^)
};
44 
45  std::optional<HtmlTagInfo>
46  parseHtmlTag(std::string::const_iterator begin,
47  std::string::const_iterator end,
48  ParseHtmlTagFlags flags)
49  {
50  boost::smatch m;
51 
52  if (boost::regex_search(
53  begin,
54  end,
55  m,
56  (flags == cAlone ? cOneHtmlTokenExpression : cStartHtmlTokenExpression)))
57  {
58  HtmlTagInfo r;
59  r.tagName = m[3];
60 
61  if (m[4].matched)
62  {
63  r.extra = m[4];
64  }
65 
66  r.isClosingTag = (m[2].length() > 0);
67  r.lengthOfToken = m[0].length();
68  return r;
69  }
70 
71  return std::nullopt;
72  }
73 
75  parseInlineHtmlText(const std::string& src)
76  {
78  std::string::const_iterator prev = src.begin(), end = src.end();
79 
80  while (true)
81  {
82  boost::smatch m;
83 
84  if (boost::regex_search(prev, end, m, cHtmlTokenExpression))
85  {
86  if (prev != m[0].first)
87  {
88  //cerr << " Non-tag (" << std::distance(prev, m[0].first) << "): " << std::string(prev, m[0].first) << endl;
89  r.push_back(TokenPtr(
90  new markdown::token::InlineHtmlContents(std::string(prev, m[0].first))));
91  }
92 
93  //cerr << " Tag: " << m[1] << endl;
94  r.push_back(TokenPtr(new markdown::token::HtmlTag(m[1])));
95  prev = m[0].second;
96  }
97  else
98  {
99  std::string eol;
100 
101  if (prev != end)
102  {
103  eol = std::string(prev, end);
104  //cerr << " Non-tag: " << eol << endl;
105  }
106 
107  eol += '\n';
108  r.push_back(TokenPtr(new markdown::token::InlineHtmlContents(eol)));
109  break;
110  }
111  }
112 
113  return r;
114  }
115 
116  bool
117  isHtmlCommentStart(std::string::const_iterator begin, std::string::const_iterator end)
118  {
119  // It can't be a single-line comment, those will already have been parsed
120  // by isBlankLine.
121  static const boost::regex cExpression("^<!--");
122  return boost::regex_search(begin, end, cExpression);
123  }
124 
125  bool
126  isHtmlCommentEnd(std::string::const_iterator begin, std::string::const_iterator end)
127  {
128  static const boost::regex cExpression(".*-- *>$");
129  return boost::regex_match(begin, end, cExpression);
130  }
131 
132  bool
133  isBlankLine(const std::string& line)
134  {
135  static const boost::regex cExpression(" {0,3}(<--(.*)-- *> *)* *");
136  return boost::regex_match(line, cExpression);
137  }
138 
139  std::optional<TokenPtr>
140  parseInlineHtml(CTokenGroupIter& i, CTokenGroupIter end)
141  {
142  // Preconditions: Previous line was blank, or this is the first line.
143  if ((*i)->text())
144  {
145  const std::string line(*(*i)->text());
146 
147  bool tag = false, comment = false;
148  std::optional<HtmlTagInfo> tagInfo = parseHtmlTag(line.begin(), line.end(), cStarts);
149 
150  if (tagInfo && markdown::token::isValidTag(tagInfo->tagName) > 1)
151  {
152  tag = true;
153  }
154  else if (isHtmlCommentStart(line.begin(), line.end()))
155  {
156  comment = true;
157  }
158 
159  if (tag)
160  {
161  // Block continues until an HTML tag (alone) on a line followed by a
162  // blank line.
163  markdown::TokenGroup contents;
164  CTokenGroupIter firstLine = i, prevLine = i;
165  size_t lines = 0;
166 
167  bool done = false;
168 
169  do
170  {
171  // We encode HTML tags so that their contents gets properly
172  // handled -- i.e. "<div style=">"/>" becomes <div style="&gt;"/>
173  if ((*i)->text())
174  {
175  markdown::TokenGroup t = parseInlineHtmlText(*(*i)->text());
176  contents.splice(contents.end(), t);
177  }
178  else
179  {
180  contents.push_back(*i);
181  }
182 
183  prevLine = i;
184  ++i;
185  ++lines;
186 
187  if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
188  {
189  if (prevLine == firstLine)
190  {
191  done = true;
192  }
193  else
194  {
195  const std::string text(*(*prevLine)->text());
196 
197  if (parseHtmlTag(text.begin(), text.end(), cAlone))
198  {
199  done = true;
200  }
201  }
202  }
203  } while (i != end && !done);
204 
205  if (lines > 1 || markdown::token::isValidTag(tagInfo->tagName, true) > 1)
206  {
207  i = prevLine;
208  return TokenPtr(new markdown::token::InlineHtmlBlock(contents));
209  }
210  else
211  {
212  // Single-line HTML "blocks" whose initial tags are span-tags
213  // don't qualify as inline HTML.
214  i = firstLine;
215  return std::nullopt;
216  }
217  }
218  else if (comment)
219  {
220  // Comment continues until a closing tag is found; at present, it
221  // also has to be the last thing on the line, and has to be
222  // immediately followed by a blank line too.
223  markdown::TokenGroup contents;
224  CTokenGroupIter firstLine = i, prevLine = i;
225 
226  bool done = false;
227 
228  do
229  {
230  if ((*i)->text())
231  {
232  contents.push_back(
233  TokenPtr(new markdown::token::InlineHtmlComment(*(*i)->text() + '\n')));
234  }
235  else
236  {
237  contents.push_back(*i);
238  }
239 
240  prevLine = i;
241  ++i;
242 
243  if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
244  {
245  if (prevLine == firstLine)
246  {
247  done = true;
248  }
249  else
250  {
251  const std::string text(*(*prevLine)->text());
252 
253  if (isHtmlCommentEnd(text.begin(), text.end()))
254  {
255  done = true;
256  }
257  }
258  }
259  } while (i != end && !done);
260 
261  i = prevLine;
262  return TokenPtr(new markdown::token::InlineHtmlBlock(contents));
263  }
264  }
265 
266  return std::nullopt;
267  }
268 
269  std::optional<std::string>
270  isCodeBlockLine(CTokenGroupIter& i, CTokenGroupIter end)
271  {
272  if ((*i)->isBlankLine())
273  {
274  // If we get here, we're already in a code block.
275  ++i;
276 
277  if (i != end)
278  {
279  std::optional<std::string> r = isCodeBlockLine(i, end);
280 
281  if (r)
282  {
283  return std::string("\n" + *r);
284  }
285  }
286 
287  --i;
288  }
289  else if ((*i)->text() && (*i)->canContainMarkup())
290  {
291  std::string line(*(*i)->text());
292 
293  if (line.length() >= 4)
294  {
295  std::string::iterator si = line.begin(), sie = si + 4;
296 
297  while (si != sie && *si == ' ')
298  {
299  ++si;
300  }
301 
302  if (si == sie)
303  {
304  ++i;
305  return std::string(si, line.end());
306  }
307  }
308  }
309 
310  return std::nullopt;
311  }
312 
313  std::optional<TokenPtr>
314  parseCodeBlock(CTokenGroupIter& i, CTokenGroupIter end)
315  {
316  if (!(*i)->isBlankLine())
317  {
318  std::optional<std::string> contents = isCodeBlockLine(i, end);
319 
320  if (contents)
321  {
322  std::ostringstream out;
323  out << *contents << '\n';
324 
325  while (i != end)
326  {
327  contents = isCodeBlockLine(i, end);
328 
329  if (contents)
330  {
331  out << *contents << '\n';
332  }
333  else
334  {
335  break;
336  }
337  }
338 
339  return TokenPtr(new markdown::token::CodeBlock(out.str()));
340  }
341  }
342 
343  return std::nullopt;
344  }
345 
// Returns the number of '>' characters contained in prefixString.
size_t
countQuoteLevel(const std::string& prefixString)
{
    size_t level = 0;

    // Hop from one '>' to the next instead of inspecting every character.
    for (std::string::size_type pos = prefixString.find('>'); pos != std::string::npos;
         pos = prefixString.find('>', pos + 1))
    {
        ++level;
    }

    return level;
}
359 
360  std::optional<TokenPtr>
361  parseBlockQuote(CTokenGroupIter& i, CTokenGroupIter end)
362  {
363  static const boost::regex cBlockQuoteExpression("^((?: {0,3}>)+) (.*)$");
364  // Useful captures: 1=prefix, 2=content
365 
366  if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
367  {
368  const std::string line(*(*i)->text());
369  boost::smatch m;
370 
371  if (boost::regex_match(line, m, cBlockQuoteExpression))
372  {
373  size_t quoteLevel = countQuoteLevel(m[1]);
374  boost::regex continuationExpression =
375  boost::regex("^((?: {0,3}>){" + std::to_string(quoteLevel) + "}) ?(.*)$");
376 
377  markdown::TokenGroup subTokens;
378  subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
379 
380  // The next line can be a continuation of this quote (with or
381  // without the prefix string) or a blank line. Blank lines are
382  // treated as part of this quote if the following line is a
383  // properly-prefixed quote line too, otherwise they terminate the
384  // quote.
385  ++i;
386 
387  while (i != end)
388  {
389  if ((*i)->isBlankLine())
390  {
391  CTokenGroupIter ii = i;
392  ++ii;
393 
394  if (ii == end)
395  {
396  i = ii;
397  break;
398  }
399  else
400  {
401  const std::string line(*(*ii)->text());
402 
403  if (boost::regex_match(line, m, continuationExpression))
404  {
405  if (m[1].matched && m[1].length() > 0)
406  {
407  i = ++ii;
408  subTokens.push_back(TokenPtr(new markdown::token::BlankLine));
409  subTokens.push_back(
411  }
412  else
413  {
414  break;
415  }
416  }
417  else
418  {
419  break;
420  }
421  }
422  }
423  else
424  {
425  const std::string line(*(*i)->text());
426 
427  if (boost::regex_match(line, m, continuationExpression))
428  {
429  assert(m[2].matched);
430 
431  if (!isBlankLine(m[2]))
432  {
433  subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
434  }
435  else
436  {
437  subTokens.push_back(TokenPtr(new markdown::token::BlankLine(m[2])));
438  }
439 
440  ++i;
441  }
442  else
443  {
444  break;
445  }
446  }
447  }
448 
449  return TokenPtr(new markdown::token::BlockQuote(subTokens));
450  }
451  }
452 
453  return std::nullopt;
454  }
455 
456  std::optional<TokenPtr>
457  parseListBlock(CTokenGroupIter& i, CTokenGroupIter end, bool sub = false)
458  {
459  static const boost::regex cUnorderedListExpression("^( *)([*+-]) +([^*-].*)$");
460  static const boost::regex cOrderedListExpression("^( *)([0-9]+)\\. +(.*)$");
461 
462  enum ListType
463  {
464  cNone,
465  cUnordered,
466  cOrdered
467  };
468 
469  ListType type = cNone;
470 
471  if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
472  {
473  boost::regex nextItemExpression, startSublistExpression;
474  size_t indent = 0;
475 
476  const std::string line((*i)->text().value());
477 
478  //cerr << "IsList? " << line << endl;
479 
480  markdown::TokenGroup subTokens, subItemTokens;
481 
482  boost::smatch m;
483 
484  if (boost::regex_match(line, m, cUnorderedListExpression))
485  {
486  indent = m[1].length();
487 
488  if (sub || indent < 4)
489  {
490  type = cUnordered;
491  char startChar = *m[2].first;
492  subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[3])));
493 
494  std::ostringstream next;
495  next << "^" << std::string(indent, ' ') << "\\" << startChar << " +([^*-].*)$";
496  nextItemExpression = next.str();
497  }
498  }
499  else if (boost::regex_match(line, m, cOrderedListExpression))
500  {
501  indent = m[1].length();
502 
503  if (sub || indent < 4)
504  {
505  type = cOrdered;
506  subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[3])));
507 
508  std::ostringstream next;
509  next << "^" << std::string(indent, ' ') << "[0-9]+\\. +(.*)$";
510  nextItemExpression = next.str();
511  }
512  }
513 
514  if (type != cNone)
515  {
516  CTokenGroupIter originalI = i;
517  size_t itemCount = 1;
518  std::ostringstream sub;
519  sub << "^" << std::string(indent, ' ') << " +(([*+-])|([0-9]+\\.)) +.*$";
520  startSublistExpression = sub.str();
521 
522  // There are several options for the next line. It's another item in
523  // this list (in which case this one is done); it's a continuation
524  // of this line (collect it and keep going); it's the first item in
525  // a sub-list (call this function recursively to collect it), it's
526  // the next item in the parent list (this one is ended); or it's
527  // blank.
528  //
529  // A blank line requires looking ahead. If the next line is an item
530  // for this list, then switch this list into paragraph-items mode
531  // and continue processing. If it's indented by four or more spaces
532  // (more than the list itself), then it's another continuation of
533  // the current item. Otherwise it's either a new paragraph (and this
534  // list is ended) or the beginning of a sub-list.
535  static const boost::regex cContinuedItemExpression("^ *([^ ].*)$");
536 
537  boost::regex continuedAfterBlankLineExpression("^ {" + std::to_string(indent + 4) +
538  "}([^ ].*)$");
539  boost::regex codeBlockAfterBlankLineExpression("^ {" + std::to_string(indent + 8) +
540  "}(.*)$");
541 
542  enum NextItemType
543  {
544  cUnknown,
545  cEndOfList,
546  cAnotherItem
547  };
548 
549  NextItemType nextItem = cUnknown;
550  bool setParagraphMode = false;
551 
552  ++i;
553 
554  while (i != end)
555  {
556  if ((*i)->isBlankLine())
557  {
558  CTokenGroupIter ii = i;
559  ++ii;
560 
561  if (ii == end)
562  {
563  i = ii;
564  nextItem = cEndOfList;
565  }
566  else if ((*ii)->text())
567  {
568  const std::string line(*(*ii)->text());
569 
570  if (boost::regex_match(line, startSublistExpression))
571  {
572  setParagraphMode = true;
573  ++itemCount;
574  i = ii;
575  std::optional<TokenPtr> p = parseListBlock(i, end, true);
576  assert(p);
577  subItemTokens.push_back(*p);
578  continue;
579  }
580  else if (boost::regex_match(line, m, nextItemExpression))
581  {
582  setParagraphMode = true;
583  i = ii;
584  nextItem = cAnotherItem;
585  }
586  else if (boost::regex_match(line, m, continuedAfterBlankLineExpression))
587  {
588  assert(m[1].matched);
589  subItemTokens.push_back(TokenPtr(new markdown::token::BlankLine()));
590  subItemTokens.push_back(
592  i = ++ii;
593  continue;
594  }
595  else if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression))
596  {
597  setParagraphMode = true;
598  ++itemCount;
599  assert(m[1].matched);
600  subItemTokens.push_back(TokenPtr(new markdown::token::BlankLine()));
601 
602  std::string codeBlock = m[1] + '\n';
603  ++ii;
604 
605  while (ii != end)
606  {
607  if ((*ii)->isBlankLine())
608  {
609  CTokenGroupIter iii = ii;
610  ++iii;
611  const std::string nextLine(*(*iii)->text());
612 
613  if (boost::regex_match(
614  nextLine, m, codeBlockAfterBlankLineExpression))
615  {
616  codeBlock += '\n' + m[1] + '\n';
617  ii = iii;
618  }
619  else
620  {
621  break;
622  }
623  }
624  else if ((*ii)->text())
625  {
626  const std::string line(*(*ii)->text());
627 
628  if (boost::regex_match(
629  line, m, codeBlockAfterBlankLineExpression))
630  {
631  codeBlock += m[1] + '\n';
632  }
633  else
634  {
635  break;
636  }
637  }
638  else
639  {
640  break;
641  }
642 
643  ++ii;
644  }
645 
646  subItemTokens.push_back(
647  TokenPtr(new markdown::token::CodeBlock(codeBlock)));
648  i = ii;
649  continue;
650  }
651  else
652  {
653  nextItem = cEndOfList;
654  }
655  }
656  else
657  {
658  break;
659  }
660  }
661  else if ((*i)->text())
662  {
663  const std::string line(*(*i)->text());
664 
665  if (boost::regex_match(line, startSublistExpression))
666  {
667  ++itemCount;
668  std::optional<TokenPtr> p = parseListBlock(i, end, true);
669  assert(p);
670  subItemTokens.push_back(*p);
671  continue;
672  }
673  else if (boost::regex_match(line, m, nextItemExpression))
674  {
675  nextItem = cAnotherItem;
676  }
677  else
678  {
679  if (boost::regex_match(line, m, cUnorderedListExpression) ||
680  boost::regex_match(line, m, cOrderedListExpression))
681  {
682  // Belongs to the parent list
683  nextItem = cEndOfList;
684  }
685  else
686  {
687  boost::regex_match(line, m, cContinuedItemExpression);
688  assert(m[1].matched);
689  subItemTokens.push_back(
691  ++i;
692  continue;
693  }
694  }
695  }
696  else
697  {
698  nextItem = cEndOfList;
699  }
700 
701  if (!subItemTokens.empty())
702  {
703  subTokens.push_back(TokenPtr(new markdown::token::ListItem(subItemTokens)));
704  subItemTokens.clear();
705  }
706 
707  assert(nextItem != cUnknown);
708 
709  if (nextItem == cAnotherItem)
710  {
711  subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[1])));
712  ++itemCount;
713  ++i;
714  }
715  else // nextItem==cEndOfList
716  {
717  break;
718  }
719  }
720 
721  // In case we hit the end with an unterminated item...
722  if (!subItemTokens.empty())
723  {
724  subTokens.push_back(TokenPtr(new markdown::token::ListItem(subItemTokens)));
725  subItemTokens.clear();
726  }
727 
728  if (itemCount > 1 || indent != 0)
729  {
730  if (type == cUnordered)
731  {
732  return TokenPtr(
733  new markdown::token::UnorderedList(subTokens, setParagraphMode));
734  }
735  else
736  {
737  return TokenPtr(
738  new markdown::token::OrderedList(subTokens, setParagraphMode));
739  }
740  }
741  else
742  {
743  // It looked like a list, but turned out to be a false alarm.
744  i = originalI;
745  return std::nullopt;
746  }
747  }
748  }
749 
750  return std::nullopt;
751  }
752 
753  bool
754  parseReference(CTokenGroupIter& i, CTokenGroupIter end, markdown::LinkIds& idTable)
755  {
756  if ((*i)->text())
757  {
758  static const boost::regex cReference(
759  "^ {0,3}\\[(.+)\\]: +<?([^ >]+)>?(?: *(?:('|\")(.*)\\3)|(?:\\((.*)\\)))?$");
760  // Useful captures: 1=id, 2=url, 4/5=title
761 
762  const std::string line1(*(*i)->text());
763  boost::smatch m;
764 
765  if (boost::regex_match(line1, m, cReference))
766  {
767  std::string id(m[1]), url(m[2]), title;
768 
769  if (m[4].matched)
770  {
771  title = m[4];
772  }
773  else if (m[5].matched)
774  {
775  title = m[5];
776  }
777  else
778  {
779  CTokenGroupIter ii = i;
780  ++ii;
781 
782  if (ii != end && (*ii)->text())
783  {
784  // It could be on the next line
785  static const boost::regex cSeparateTitle(
786  "^ *(?:(?:('|\")(.*)\\1)|(?:\\((.*)\\))) *$");
787  // Useful Captures: 2/3=title
788 
789  const std::string line2(*(*ii)->text());
790 
791  if (boost::regex_match(line2, m, cSeparateTitle))
792  {
793  ++i;
794  title = (m[2].matched ? m[2] : m[3]);
795  }
796  }
797  }
798 
799  idTable.add(id, url, title);
800  return true;
801  }
802  }
803 
804  return false;
805  }
806 
807  void
808  flushParagraph(std::string& paragraphText,
809  markdown::TokenGroup& paragraphTokens,
810  markdown::TokenGroup& finalTokens,
811  bool noParagraphs)
812  {
813  if (!paragraphText.empty())
814  {
815  paragraphTokens.push_back(TokenPtr(new markdown::token::RawText(paragraphText)));
816  paragraphText.clear();
817  }
818 
819  if (!paragraphTokens.empty())
820  {
821  if (noParagraphs)
822  {
823  if (paragraphTokens.size() > 1)
824  {
825  finalTokens.push_back(
826  TokenPtr(new markdown::token::Container(paragraphTokens)));
827  }
828  else
829  {
830  finalTokens.push_back(*paragraphTokens.begin());
831  }
832  }
833  else
834  {
835  finalTokens.push_back(TokenPtr(new markdown::token::Paragraph(paragraphTokens)));
836  }
837 
838  paragraphTokens.clear();
839  }
840  }
841 
842  std::optional<TokenPtr>
843  parseHeader(CTokenGroupIter& i, CTokenGroupIter end)
844  {
845  if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
846  {
847  // Hash-mark type
848  static const boost::regex cHashHeaders("^(#{1,6}) +(.*?) *#*$");
849  const std::string line = *(*i)->text();
850  boost::smatch m;
851 
852  if (boost::regex_match(line, m, cHashHeaders))
853  {
854  return TokenPtr(new markdown::token::Header(m[1].length(), m[2]));
855  }
856 
857  // Underlined type
858  CTokenGroupIter ii = i;
859  ++ii;
860 
861  if (ii != end && !(*ii)->isBlankLine() && (*ii)->text() && (*ii)->canContainMarkup())
862  {
863  static const boost::regex cUnderlinedHeaders("^([-=])\\1*$");
864  const std::string line = *(*ii)->text();
865 
866  if (boost::regex_match(line, m, cUnderlinedHeaders))
867  {
868  char typeChar = std::string(m[1])[0];
869  TokenPtr p = TokenPtr(
870  new markdown::token::Header((typeChar == '=' ? 1 : 2), *(*i)->text()));
871  i = ii;
872  return p;
873  }
874  }
875  }
876 
877  return std::nullopt;
878  }
879 
880  std::optional<TokenPtr>
881  parseHorizontalRule(CTokenGroupIter& i, CTokenGroupIter end)
882  {
883  if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
884  {
885  static const boost::regex cHorizontalRules("^ {0,3}((?:-|\\*|_) *){3,}$");
886  const std::string line = *(*i)->text();
887 
888  if (boost::regex_match(line, cHorizontalRules))
889  {
890  return TokenPtr(new markdown::token::HtmlTag("hr/"));
891  }
892  }
893 
894  return std::nullopt;
895  }
896 
897 } // namespace
898 
899 namespace markdown
900 {
901 
902  std::optional<LinkIds::Target>
903  LinkIds::find(const std::string& id) const
904  {
905  Table::const_iterator i = mTable.find(_scrubKey(id));
906 
907  if (i != mTable.end())
908  {
909  return i->second;
910  }
911  else
912  {
913  return std::nullopt;
914  }
915  }
916 
917  void
918  LinkIds::add(const std::string& id, const std::string& url, const std::string& title)
919  {
920  mTable.insert(std::make_pair(_scrubKey(id), Target(url, title)));
921  }
922 
923  std::string
924  LinkIds::_scrubKey(std::string str)
925  {
926  return simox::alg::to_lower(str);
927  }
928 
929  const size_t Document::cSpacesPerInitialTab = 4; // Required by Markdown format
930  const size_t Document::cDefaultSpacesPerTab = cSpacesPerInitialTab;
931 
932  Document::Document(size_t spacesPerTab) :
933  cSpacesPerTab(spacesPerTab),
934  mTokenContainer(new token::Container),
935  mIdTable(new LinkIds),
936  mProcessed(false)
937  {
938  // This space deliberately blank ;-)
939  }
940 
941  Document::Document(std::istream& in, size_t spacesPerTab) :
942  cSpacesPerTab(spacesPerTab),
943  mTokenContainer(new token::Container),
944  mIdTable(new LinkIds),
945  mProcessed(false)
946  {
947  read(in);
948  }
949 
951  {
952  delete mIdTable;
953  }
954 
955  bool
956  Document::read(const std::string& src)
957  {
958  std::istringstream in(src);
959  return read(in);
960  }
961 
962  bool
963  Document::_getline(std::istream& in, std::string& line)
964  {
965  // Handles \n, \r, and \r\n (and even \n\r) on any system. Also does tab-
966  // expansion, since this is the most efficient place for it.
967  line.clear();
968 
969  bool initialWhitespace = true;
970  char c;
971 
972  while (in.get(c))
973  {
974  if (c == '\r')
975  {
976  if ((in.get(c)) && c != '\n')
977  {
978  in.unget();
979  }
980 
981  return true;
982  }
983  else if (c == '\n')
984  {
985  if ((in.get(c)) && c != '\r')
986  {
987  in.unget();
988  }
989 
990  return true;
991  }
992  else if (c == '\t')
993  {
994  size_t convert = (initialWhitespace ? cSpacesPerInitialTab : cSpacesPerTab);
995  line += std::string(convert - (line.length() % convert), ' ');
996  }
997  else
998  {
999  line.push_back(c);
1000 
1001  if (c != ' ')
1002  {
1003  initialWhitespace = false;
1004  }
1005  }
1006  }
1007 
1008  return !line.empty();
1009  }
1010 
1011  bool
1012  Document::read(std::istream& in)
1013  {
1014  if (mProcessed)
1015  {
1016  return false;
1017  }
1018 
1019  token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
1020  assert(tokens != 0);
1021 
1022  std::string line;
1023  TokenGroup tgt;
1024 
1025  while (_getline(in, line))
1026  {
1027  if (isBlankLine(line))
1028  {
1029  tgt.push_back(TokenPtr(new token::BlankLine(line)));
1030  }
1031  else
1032  {
1033  tgt.push_back(TokenPtr(new token::RawText(line)));
1034  }
1035  }
1036 
1037  tokens->appendSubtokens(tgt);
1038 
1039  return true;
1040  }
1041 
1042  void
1043  Document::write(std::ostream& out)
1044  {
1045  _process();
1046  mTokenContainer->writeAsHtml(out);
1047  }
1048 
1049  void
1050  Document::writeTokens(std::ostream& out)
1051  {
1052  _process();
1053  mTokenContainer->writeToken(0, out);
1054  }
1055 
1056  void
1057  Document::_process()
1058  {
1059  if (!mProcessed)
1060  {
1061  _mergeMultilineHtmlTags();
1062  _processInlineHtmlAndReferences();
1063  _processBlocksItems(mTokenContainer);
1064  _processParagraphLines(mTokenContainer);
1065  mTokenContainer->processSpanElements(*mIdTable);
1066  mProcessed = true;
1067  }
1068  }
1069 
1070  void
1071  Document::_mergeMultilineHtmlTags()
1072  {
1073  static const boost::regex cHtmlTokenStart(
1074  "<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))$");
1075  static const boost::regex cHtmlTokenEnd(
1076  "^ *((?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\3))*? */? *))>");
1077 
1078  TokenGroup processed;
1079 
1080  token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
1081  assert(tokens != 0);
1082 
1083  for (TokenGroup::const_iterator i = tokens->subTokens().begin(),
1084  ie = tokens->subTokens().end();
1085  i != ie;
1086  ++i)
1087  {
1088  if ((*i)->text() && boost::regex_match(*(*i)->text(), cHtmlTokenStart))
1089  {
1090  TokenGroup::const_iterator i2 = i;
1091  ++i2;
1092 
1093  if (i2 != tokens->subTokens().end() && (*i2)->text() &&
1094  boost::regex_match(*(*i2)->text(), cHtmlTokenEnd))
1095  {
1096  processed.push_back(TokenPtr(
1097  new markdown::token::RawText(*(*i)->text() + ' ' + *(*i2)->text())));
1098  ++i;
1099  continue;
1100  }
1101  }
1102 
1103  processed.push_back(*i);
1104  }
1105 
1106  tokens->swapSubtokens(processed);
1107  }
1108 
1109  void
1110  Document::_processInlineHtmlAndReferences()
1111  {
1112  TokenGroup processed;
1113 
1114  token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
1115  assert(tokens != 0);
1116 
1117  for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1118  iie = tokens->subTokens().end();
1119  ii != iie;
1120  ++ii)
1121  {
1122  if ((*ii)->text())
1123  {
1124  if (processed.empty() || processed.back()->isBlankLine())
1125  {
1126  std::optional<TokenPtr> inlineHtml = parseInlineHtml(ii, iie);
1127 
1128  if (inlineHtml)
1129  {
1130  processed.push_back(*inlineHtml);
1131 
1132  if (ii == iie)
1133  {
1134  break;
1135  }
1136 
1137  continue;
1138  }
1139  }
1140 
1141  if (parseReference(ii, iie, *mIdTable))
1142  {
1143  if (ii == iie)
1144  {
1145  break;
1146  }
1147 
1148  continue;
1149  }
1150 
1151  // If it gets down here, just store it in its current (raw text)
1152  // form. We'll group the raw text lines into paragraphs in a
1153  // later pass, since we can't easily tell where paragraphs
1154  // end until then.
1155  }
1156 
1157  processed.push_back(*ii);
1158  }
1159 
1160  tokens->swapSubtokens(processed);
1161  }
1162 
1163  void
1164  Document::_processBlocksItems(TokenPtr inTokenContainer)
1165  {
1166  if (!inTokenContainer->isContainer())
1167  {
1168  return;
1169  }
1170 
1171  token::Container* tokens = dynamic_cast<token::Container*>(inTokenContainer.get());
1172  assert(tokens != 0);
1173 
1174  TokenGroup processed;
1175 
1176  for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1177  iie = tokens->subTokens().end();
1178  ii != iie;
1179  ++ii)
1180  {
1181  if ((*ii)->text())
1182  {
1183  std::optional<TokenPtr> subitem;
1184 
1185  if (!subitem)
1186  {
1187  subitem = parseHeader(ii, iie);
1188  }
1189 
1190  if (!subitem)
1191  {
1192  subitem = parseHorizontalRule(ii, iie);
1193  }
1194 
1195  if (!subitem)
1196  {
1197  subitem = parseListBlock(ii, iie);
1198  }
1199 
1200  if (!subitem)
1201  {
1202  subitem = parseBlockQuote(ii, iie);
1203  }
1204 
1205  if (!subitem)
1206  {
1207  subitem = parseCodeBlock(ii, iie);
1208  }
1209 
1210  if (subitem)
1211  {
1212  _processBlocksItems(*subitem);
1213  processed.push_back(*subitem);
1214 
1215  if (ii == iie)
1216  {
1217  break;
1218  }
1219 
1220  continue;
1221  }
1222  else
1223  {
1224  processed.push_back(*ii);
1225  }
1226  }
1227  else if ((*ii)->isContainer())
1228  {
1229  _processBlocksItems(*ii);
1230  processed.push_back(*ii);
1231  }
1232  }
1233 
1234  tokens->swapSubtokens(processed);
1235  }
1236 
1237  void
1238  Document::_processParagraphLines(TokenPtr inTokenContainer)
1239  {
1240  token::Container* tokens = dynamic_cast<token::Container*>(inTokenContainer.get());
1241  assert(tokens != 0);
1242 
1243  bool noPara = tokens->inhibitParagraphs();
1244 
1245  for (const auto& ii : tokens->subTokens())
1246  if (ii->isContainer())
1247  {
1248  _processParagraphLines(ii);
1249  }
1250 
1251  TokenGroup processed;
1252  std::string paragraphText;
1253  TokenGroup paragraphTokens;
1254 
1255  for (const auto& ii : tokens->subTokens())
1256  {
1257  if (ii->text() && ii->canContainMarkup() && !ii->inhibitParagraphs())
1258  {
1259  static const boost::regex cExpression("^(.*) $");
1260 
1261  if (!paragraphText.empty())
1262  {
1263  paragraphText += " ";
1264  }
1265 
1266  boost::smatch m;
1267 
1268  if (boost::regex_match(*ii->text(), m, cExpression))
1269  {
1270  paragraphText += m[1];
1271  flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1272  processed.push_back(TokenPtr(new markdown::token::HtmlTag("br/")));
1273  }
1274  else
1275  {
1276  paragraphText += *ii->text();
1277  }
1278  }
1279  else
1280  {
1281  flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1282  processed.push_back(ii);
1283  }
1284  }
1285 
1286  // Make sure the last paragraph is properly flushed too.
1287  flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1288 
1289  tokens->swapSubtokens(processed);
1290  }
1291 
1292 } // namespace markdown
armarx::navigation::platform_controller::platform_global_trajectory::Target
Twist2D Target
Definition: PlatformGlobalTrajectoryController.h:82
markdown::token::Header
Definition: markdown-tokens.h:329
str
std::string str(const T &t)
Definition: UserAssistedSegmenterGuiWidgetController.cpp:43
markdown::token::InlineHtmlContents
Definition: markdown-tokens.h:265
markdown::Document::writeTokens
void writeTokens(std::ostream &)
Definition: markdown.cpp:1050
markdown.h
c
constexpr T c
Definition: UnscentedKalmanFilterTest.cpp:46
markdown::token::UnorderedList
Definition: markdown-tokens.h:568
armarx::detail::StreamPrinterTag::tag
@ tag
markdown::token::isValidTag
size_t isValidTag(const std::string &tag, bool nonBlockFirst)
Definition: markdown-tokens.cpp:323
convert
void convert(const std::filesystem::path &in, const std::filesystem::path &out, bool print_progress)
Performs the actual conversion.
Definition: main.cpp:170
httplib::detail::case_ignore::to_lower
unsigned char to_lower(int c)
Definition: httplib.h:346
markdown::token::Container
Definition: markdown-tokens.h:416
markdown::Document::read
bool read(const std::string &)
Definition: markdown.cpp:956
markdown::LinkIds::add
void add(const std::string &id, const std::string &url, const std::string &title)
Definition: markdown.cpp:918
markdown::TokenPtr
std::shared_ptr< Token > TokenPtr
Definition: markdown.h:21
markdown::Document::Document
Document(size_t spacesPerTab=cDefaultSpacesPerTab)
Definition: markdown.cpp:932
markdown::Document::~Document
~Document()
Definition: markdown.cpp:950
markdown::token::RawText
Definition: markdown-tokens.h:189
markdown::token::BlankLine
Definition: markdown-tokens.h:366
markdown
Definition: markdown-tokens.cpp:16
markdown::Document::write
void write(std::ostream &)
Definition: markdown.cpp:1043
markdown::token::InlineHtmlComment
Definition: markdown-tokens.h:280
armarx::control::hardware_config::tagName
std::string tagName(ConfigTag tag)
Definition: Config.cpp:301
markdown::LinkIds
Definition: markdown-tokens.h:21
markdown::CTokenGroupIter
TokenGroup::const_iterator CTokenGroupIter
Definition: markdown-tokens.h:19
markdown::token::ListItem
Definition: markdown-tokens.h:520
armarx::to_string
const std::string & to_string(const std::string &s)
Definition: StringHelpers.h:41
markdown::token::OrderedList
Definition: markdown-tokens.h:599
markdown::token::Paragraph
Definition: markdown-tokens.h:666
markdown::token::CodeBlock
Definition: markdown-tokens.h:294
markdown-tokens.h
markdown::token::BlockQuote
Definition: markdown-tokens.h:633
sub
Point sub(const Point &x, const Point &y)
Definition: point.hpp:46
markdown::TokenGroup
std::list< TokenPtr > TokenGroup
Definition: markdown.h:22
markdown::token::InlineHtmlBlock
Definition: markdown-tokens.h:477
markdown::LinkIds::find
std::optional< Target > find(const std::string &id) const
Definition: markdown.cpp:903
markdown::token::Container::appendSubtokens
void appendSubtokens(TokenGroup &tokens)
Definition: markdown-tokens.h:431
markdown::token::HtmlTag
Definition: markdown-tokens.h:226