markdown.cpp
Go to the documentation of this file.
1 
2 /*
3  Copyright (c) 2009 by Chad Nelson
4  Released under the MIT License.
5  See the provided LICENSE.TXT file for details.
6 */
7 
8 #include "markdown.h"
9 #include "markdown-tokens.h"
10 
11 #include <SimoxUtility/algorithm/string/string_tools.h>
12 
13 #include <boost/regex.hpp>
14 
15 #include <sstream>
16 #include <cassert>
17 
18 
19 using markdown::TokenPtr;
21 
22 namespace
23 {
24 
// Result of recognizing one HTML tag with parseHtmlTag().
// All members have defined defaults so a default-constructed object is safe to read.
struct HtmlTagInfo
{
    std::string tagName;        // Tag name (capture 3 of the tag expression)
    std::string extra;          // Attribute text (capture 4 of the tag expression), empty if none
    bool isClosingTag = false;  // True when the tag begins with "</"
    size_t lengthOfToken = 0;   // Length of the matched tag in the original string
};
31 
32  const std::string cHtmlTokenSource("<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))>");
33  const boost::regex cHtmlTokenExpression(cHtmlTokenSource),
34  cStartHtmlTokenExpression("^" + cHtmlTokenSource),
35  cOneHtmlTokenExpression("^" + cHtmlTokenSource + "$");
36 
// Selects which anchored tag expression parseHtmlTag() applies.
enum ParseHtmlTagFlags
{
    cAlone,  // Use cOneHtmlTokenExpression: the text must be exactly one tag
    cStarts  // Use cStartHtmlTokenExpression: the text must begin with a tag
};
38 
39  std::optional<HtmlTagInfo> parseHtmlTag(std::string::const_iterator begin,
40  std::string::const_iterator end, ParseHtmlTagFlags flags)
41  {
42  boost::smatch m;
43 
44  if (boost::regex_search(begin, end, m, (flags == cAlone ?
45  cOneHtmlTokenExpression : cStartHtmlTokenExpression)))
46  {
47  HtmlTagInfo r;
48  r.tagName = m[3];
49 
50  if (m[4].matched)
51  {
52  r.extra = m[4];
53  }
54 
55  r.isClosingTag = (m[2].length() > 0);
56  r.lengthOfToken = m[0].length();
57  return r;
58  }
59 
60  return std::nullopt;
61  }
62 
63  markdown::TokenGroup parseInlineHtmlText(const std::string& src)
64  {
66  std::string::const_iterator prev = src.begin(), end = src.end();
67 
68  while (true)
69  {
70  boost::smatch m;
71 
72  if (boost::regex_search(prev, end, m, cHtmlTokenExpression))
73  {
74  if (prev != m[0].first)
75  {
76  //cerr << " Non-tag (" << std::distance(prev, m[0].first) << "): " << std::string(prev, m[0].first) << endl;
77  r.push_back(TokenPtr(new markdown::token::InlineHtmlContents(std::string(prev, m[0].first))));
78  }
79 
80  //cerr << " Tag: " << m[1] << endl;
81  r.push_back(TokenPtr(new markdown::token::HtmlTag(m[1])));
82  prev = m[0].second;
83  }
84  else
85  {
86  std::string eol;
87 
88  if (prev != end)
89  {
90  eol = std::string(prev, end);
91  //cerr << " Non-tag: " << eol << endl;
92  }
93 
94  eol += '\n';
95  r.push_back(TokenPtr(new markdown::token::InlineHtmlContents(eol)));
96  break;
97  }
98  }
99 
100  return r;
101  }
102 
103  bool isHtmlCommentStart(std::string::const_iterator begin,
104  std::string::const_iterator end)
105  {
106  // It can't be a single-line comment, those will already have been parsed
107  // by isBlankLine.
108  static const boost::regex cExpression("^<!--");
109  return boost::regex_search(begin, end, cExpression);
110  }
111 
112  bool isHtmlCommentEnd(std::string::const_iterator begin,
113  std::string::const_iterator end)
114  {
115  static const boost::regex cExpression(".*-- *>$");
116  return boost::regex_match(begin, end, cExpression);
117  }
118 
119  bool isBlankLine(const std::string& line)
120  {
121  static const boost::regex cExpression(" {0,3}(<--(.*)-- *> *)* *");
122  return boost::regex_match(line, cExpression);
123  }
124 
125  std::optional<TokenPtr> parseInlineHtml(CTokenGroupIter& i, CTokenGroupIter end)
126  {
127  // Preconditions: Previous line was blank, or this is the first line.
128  if ((*i)->text())
129  {
130  const std::string line(*(*i)->text());
131 
132  bool tag = false, comment = false;
133  std::optional<HtmlTagInfo> tagInfo = parseHtmlTag(line.begin(), line.end(), cStarts);
134 
135  if (tagInfo && markdown::token::isValidTag(tagInfo->tagName) > 1)
136  {
137  tag = true;
138  }
139  else if (isHtmlCommentStart(line.begin(), line.end()))
140  {
141  comment = true;
142  }
143 
144  if (tag)
145  {
146  // Block continues until an HTML tag (alone) on a line followed by a
147  // blank line.
148  markdown::TokenGroup contents;
149  CTokenGroupIter firstLine = i, prevLine = i;
150  size_t lines = 0;
151 
152  bool done = false;
153 
154  do
155  {
156  // We encode HTML tags so that their contents gets properly
157  // handled -- i.e. "<div style=">"/>" becomes <div style="&gt;"/>
158  if ((*i)->text())
159  {
160  markdown::TokenGroup t = parseInlineHtmlText(*(*i)->text());
161  contents.splice(contents.end(), t);
162  }
163  else
164  {
165  contents.push_back(*i);
166  }
167 
168  prevLine = i;
169  ++i;
170  ++lines;
171 
172  if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
173  {
174  if (prevLine == firstLine)
175  {
176  done = true;
177  }
178  else
179  {
180  const std::string text(*(*prevLine)->text());
181 
182  if (parseHtmlTag(text.begin(), text.end(), cAlone))
183  {
184  done = true;
185  }
186  }
187  }
188  }
189  while (i != end && !done);
190 
191  if (lines > 1 || markdown::token::isValidTag(tagInfo->tagName, true) > 1)
192  {
193  i = prevLine;
194  return TokenPtr(new markdown::token::InlineHtmlBlock(contents));
195  }
196  else
197  {
198  // Single-line HTML "blocks" whose initial tags are span-tags
199  // don't qualify as inline HTML.
200  i = firstLine;
201  return std::nullopt;
202  }
203  }
204  else if (comment)
205  {
206  // Comment continues until a closing tag is found; at present, it
207  // also has to be the last thing on the line, and has to be
208  // immediately followed by a blank line too.
209  markdown::TokenGroup contents;
210  CTokenGroupIter firstLine = i, prevLine = i;
211 
212  bool done = false;
213 
214  do
215  {
216  if ((*i)->text())
217  {
218  contents.push_back(TokenPtr(new markdown::token::InlineHtmlComment(*(*i)->text() + '\n')));
219  }
220  else
221  {
222  contents.push_back(*i);
223  }
224 
225  prevLine = i;
226  ++i;
227 
228  if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
229  {
230  if (prevLine == firstLine)
231  {
232  done = true;
233  }
234  else
235  {
236  const std::string text(*(*prevLine)->text());
237 
238  if (isHtmlCommentEnd(text.begin(), text.end()))
239  {
240  done = true;
241  }
242  }
243  }
244  }
245  while (i != end && !done);
246 
247  i = prevLine;
248  return TokenPtr(new markdown::token::InlineHtmlBlock(contents));
249  }
250  }
251 
252  return std::nullopt;
253  }
254 
255  std::optional<std::string> isCodeBlockLine(CTokenGroupIter& i, CTokenGroupIter end)
256  {
257  if ((*i)->isBlankLine())
258  {
259  // If we get here, we're already in a code block.
260  ++i;
261 
262  if (i != end)
263  {
264  std::optional<std::string> r = isCodeBlockLine(i, end);
265 
266  if (r)
267  {
268  return std::string("\n" + *r);
269  }
270  }
271 
272  --i;
273  }
274  else if ((*i)->text() && (*i)->canContainMarkup())
275  {
276  std::string line(*(*i)->text());
277 
278  if (line.length() >= 4)
279  {
280  std::string::iterator si = line.begin(), sie = si + 4;
281 
282  while (si != sie && *si == ' ')
283  {
284  ++si;
285  }
286 
287  if (si == sie)
288  {
289  ++i;
290  return std::string(si, line.end());
291  }
292  }
293  }
294 
295  return std::nullopt;
296  }
297 
298  std::optional<TokenPtr> parseCodeBlock(CTokenGroupIter& i, CTokenGroupIter end)
299  {
300  if (!(*i)->isBlankLine())
301  {
302  std::optional<std::string> contents = isCodeBlockLine(i, end);
303 
304  if (contents)
305  {
306  std::ostringstream out;
307  out << *contents << '\n';
308 
309  while (i != end)
310  {
311  contents = isCodeBlockLine(i, end);
312 
313  if (contents)
314  {
315  out << *contents << '\n';
316  }
317  else
318  {
319  break;
320  }
321  }
322 
323  return TokenPtr(new markdown::token::CodeBlock(out.str()));
324  }
325  }
326 
327  return std::nullopt;
328  }
329 
330 
331 
// Returns how many '>' characters the given quote prefix contains.
size_t countQuoteLevel(const std::string& prefixString)
{
    size_t level = 0;

    for (std::string::size_type pos = prefixString.find('>');
         pos != std::string::npos;
         pos = prefixString.find('>', pos + 1))
    {
        ++level;
    }

    return level;
}
344 
345  std::optional<TokenPtr> parseBlockQuote(CTokenGroupIter& i, CTokenGroupIter end)
346  {
347  static const boost::regex cBlockQuoteExpression("^((?: {0,3}>)+) (.*)$");
348  // Useful captures: 1=prefix, 2=content
349 
350  if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
351  {
352  const std::string line(*(*i)->text());
353  boost::smatch m;
354 
355  if (boost::regex_match(line, m, cBlockQuoteExpression))
356  {
357  size_t quoteLevel = countQuoteLevel(m[1]);
358  boost::regex continuationExpression = boost::regex("^((?: {0,3}>){" + std::to_string(quoteLevel) + "}) ?(.*)$");
359 
360  markdown::TokenGroup subTokens;
361  subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
362 
363  // The next line can be a continuation of this quote (with or
364  // without the prefix string) or a blank line. Blank lines are
365  // treated as part of this quote if the following line is a
366  // properly-prefixed quote line too, otherwise they terminate the
367  // quote.
368  ++i;
369 
370  while (i != end)
371  {
372  if ((*i)->isBlankLine())
373  {
374  CTokenGroupIter ii = i;
375  ++ii;
376 
377  if (ii == end)
378  {
379  i = ii;
380  break;
381  }
382  else
383  {
384  const std::string line(*(*ii)->text());
385 
386  if (boost::regex_match(line, m, continuationExpression))
387  {
388  if (m[1].matched && m[1].length() > 0)
389  {
390  i = ++ii;
391  subTokens.push_back(TokenPtr(new markdown::token::BlankLine));
392  subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
393  }
394  else
395  {
396  break;
397  }
398  }
399  else
400  {
401  break;
402  }
403  }
404  }
405  else
406  {
407  const std::string line(*(*i)->text());
408 
409  if (boost::regex_match(line, m, continuationExpression))
410  {
411  assert(m[2].matched);
412 
413  if (!isBlankLine(m[2]))
414  {
415  subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
416  }
417  else
418  {
419  subTokens.push_back(TokenPtr(new markdown::token::BlankLine(m[2])));
420  }
421 
422  ++i;
423  }
424  else
425  {
426  break;
427  }
428  }
429  }
430 
431  return TokenPtr(new markdown::token::BlockQuote(subTokens));
432  }
433  }
434 
435  return std::nullopt;
436  }
437 
438  std::optional<TokenPtr> parseListBlock(CTokenGroupIter& i, CTokenGroupIter end, bool sub = false)
439  {
440  static const boost::regex cUnorderedListExpression("^( *)([*+-]) +([^*-].*)$");
441  static const boost::regex cOrderedListExpression("^( *)([0-9]+)\\. +(.*)$");
442 
443  enum ListType { cNone, cUnordered, cOrdered };
444  ListType type = cNone;
445 
446  if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
447  {
448  boost::regex nextItemExpression, startSublistExpression;
449  size_t indent = 0;
450 
451  const std::string line((*i)->text().value());
452 
453  //cerr << "IsList? " << line << endl;
454 
455  markdown::TokenGroup subTokens, subItemTokens;
456 
457  boost::smatch m;
458 
459  if (boost::regex_match(line, m, cUnorderedListExpression))
460  {
461  indent = m[1].length();
462 
463  if (sub || indent < 4)
464  {
465  type = cUnordered;
466  char startChar = *m[2].first;
467  subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[3])));
468 
469  std::ostringstream next;
470  next << "^" << std::string(indent, ' ') << "\\" << startChar << " +([^*-].*)$";
471  nextItemExpression = next.str();
472  }
473  }
474  else if (boost::regex_match(line, m, cOrderedListExpression))
475  {
476  indent = m[1].length();
477 
478  if (sub || indent < 4)
479  {
480  type = cOrdered;
481  subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[3])));
482 
483  std::ostringstream next;
484  next << "^" << std::string(indent, ' ') << "[0-9]+\\. +(.*)$";
485  nextItemExpression = next.str();
486  }
487  }
488 
489  if (type != cNone)
490  {
491  CTokenGroupIter originalI = i;
492  size_t itemCount = 1;
493  std::ostringstream sub;
494  sub << "^" << std::string(indent, ' ') << " +(([*+-])|([0-9]+\\.)) +.*$";
495  startSublistExpression = sub.str();
496 
497  // There are several options for the next line. It's another item in
498  // this list (in which case this one is done); it's a continuation
499  // of this line (collect it and keep going); it's the first item in
500  // a sub-list (call this function recursively to collect it), it's
501  // the next item in the parent list (this one is ended); or it's
502  // blank.
503  //
504  // A blank line requires looking ahead. If the next line is an item
505  // for this list, then switch this list into paragraph-items mode
506  // and continue processing. If it's indented by four or more spaces
507  // (more than the list itself), then it's another continuation of
508  // the current item. Otherwise it's either a new paragraph (and this
509  // list is ended) or the beginning of a sub-list.
510  static const boost::regex cContinuedItemExpression("^ *([^ ].*)$");
511 
512  boost::regex continuedAfterBlankLineExpression("^ {" +
513  std::to_string(indent + 4) + "}([^ ].*)$");
514  boost::regex codeBlockAfterBlankLineExpression("^ {" +
515  std::to_string(indent + 8) + "}(.*)$");
516 
517  enum NextItemType { cUnknown, cEndOfList, cAnotherItem };
518  NextItemType nextItem = cUnknown;
519  bool setParagraphMode = false;
520 
521  ++i;
522 
523  while (i != end)
524  {
525  if ((*i)->isBlankLine())
526  {
527  CTokenGroupIter ii = i;
528  ++ii;
529 
530  if (ii == end)
531  {
532  i = ii;
533  nextItem = cEndOfList;
534  }
535  else if ((*ii)->text())
536  {
537  const std::string line(*(*ii)->text());
538 
539  if (boost::regex_match(line, startSublistExpression))
540  {
541  setParagraphMode = true;
542  ++itemCount;
543  i = ii;
544  std::optional<TokenPtr> p = parseListBlock(i, end, true);
545  assert(p);
546  subItemTokens.push_back(*p);
547  continue;
548  }
549  else if (boost::regex_match(line, m, nextItemExpression))
550  {
551  setParagraphMode = true;
552  i = ii;
553  nextItem = cAnotherItem;
554  }
555  else if (boost::regex_match(line, m, continuedAfterBlankLineExpression))
556  {
557  assert(m[1].matched);
558  subItemTokens.push_back(TokenPtr(new markdown::token::BlankLine()));
559  subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[1])));
560  i = ++ii;
561  continue;
562  }
563  else if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression))
564  {
565  setParagraphMode = true;
566  ++itemCount;
567  assert(m[1].matched);
568  subItemTokens.push_back(TokenPtr(new markdown::token::BlankLine()));
569 
570  std::string codeBlock = m[1] + '\n';
571  ++ii;
572 
573  while (ii != end)
574  {
575  if ((*ii)->isBlankLine())
576  {
577  CTokenGroupIter iii = ii;
578  ++iii;
579  const std::string nextLine(*(*iii)->text());
580 
581  if (boost::regex_match(nextLine, m, codeBlockAfterBlankLineExpression))
582  {
583  codeBlock += '\n' + m[1] + '\n';
584  ii = iii;
585  }
586  else
587  {
588  break;
589  }
590  }
591  else if ((*ii)->text())
592  {
593  const std::string line(*(*ii)->text());
594 
595  if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression))
596  {
597  codeBlock += m[1] + '\n';
598  }
599  else
600  {
601  break;
602  }
603  }
604  else
605  {
606  break;
607  }
608 
609  ++ii;
610  }
611 
612  subItemTokens.push_back(TokenPtr(new markdown::token::CodeBlock(codeBlock)));
613  i = ii;
614  continue;
615  }
616  else
617  {
618  nextItem = cEndOfList;
619  }
620  }
621  else
622  {
623  break;
624  }
625  }
626  else if ((*i)->text())
627  {
628  const std::string line(*(*i)->text());
629 
630  if (boost::regex_match(line, startSublistExpression))
631  {
632  ++itemCount;
633  std::optional<TokenPtr> p = parseListBlock(i, end, true);
634  assert(p);
635  subItemTokens.push_back(*p);
636  continue;
637  }
638  else if (boost::regex_match(line, m, nextItemExpression))
639  {
640  nextItem = cAnotherItem;
641  }
642  else
643  {
644  if (boost::regex_match(line, m, cUnorderedListExpression)
645  || boost::regex_match(line, m, cOrderedListExpression))
646  {
647  // Belongs to the parent list
648  nextItem = cEndOfList;
649  }
650  else
651  {
652  boost::regex_match(line, m, cContinuedItemExpression);
653  assert(m[1].matched);
654  subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[1])));
655  ++i;
656  continue;
657  }
658  }
659  }
660  else
661  {
662  nextItem = cEndOfList;
663  }
664 
665  if (!subItemTokens.empty())
666  {
667  subTokens.push_back(TokenPtr(new markdown::token::ListItem(subItemTokens)));
668  subItemTokens.clear();
669  }
670 
671  assert(nextItem != cUnknown);
672 
673  if (nextItem == cAnotherItem)
674  {
675  subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[1])));
676  ++itemCount;
677  ++i;
678  }
679  else // nextItem==cEndOfList
680  {
681  break;
682  }
683  }
684 
685  // In case we hit the end with an unterminated item...
686  if (!subItemTokens.empty())
687  {
688  subTokens.push_back(TokenPtr(new markdown::token::ListItem(subItemTokens)));
689  subItemTokens.clear();
690  }
691 
692  if (itemCount > 1 || indent != 0)
693  {
694  if (type == cUnordered)
695  {
696  return TokenPtr(new markdown::token::UnorderedList(subTokens, setParagraphMode));
697  }
698  else
699  {
700  return TokenPtr(new markdown::token::OrderedList(subTokens, setParagraphMode));
701  }
702  }
703  else
704  {
705  // It looked like a list, but turned out to be a false alarm.
706  i = originalI;
707  return std::nullopt;
708  }
709  }
710  }
711 
712  return std::nullopt;
713  }
714 
715  bool parseReference(CTokenGroupIter& i, CTokenGroupIter end, markdown::LinkIds& idTable)
716  {
717  if ((*i)->text())
718  {
719  static const boost::regex cReference("^ {0,3}\\[(.+)\\]: +<?([^ >]+)>?(?: *(?:('|\")(.*)\\3)|(?:\\((.*)\\)))?$");
720  // Useful captures: 1=id, 2=url, 4/5=title
721 
722  const std::string line1(*(*i)->text());
723  boost::smatch m;
724 
725  if (boost::regex_match(line1, m, cReference))
726  {
727  std::string id(m[1]), url(m[2]), title;
728 
729  if (m[4].matched)
730  {
731  title = m[4];
732  }
733  else if (m[5].matched)
734  {
735  title = m[5];
736  }
737  else
738  {
739  CTokenGroupIter ii = i;
740  ++ii;
741 
742  if (ii != end && (*ii)->text())
743  {
744  // It could be on the next line
745  static const boost::regex cSeparateTitle("^ *(?:(?:('|\")(.*)\\1)|(?:\\((.*)\\))) *$");
746  // Useful Captures: 2/3=title
747 
748  const std::string line2(*(*ii)->text());
749 
750  if (boost::regex_match(line2, m, cSeparateTitle))
751  {
752  ++i;
753  title = (m[2].matched ? m[2] : m[3]);
754  }
755  }
756  }
757 
758  idTable.add(id, url, title);
759  return true;
760  }
761  }
762 
763  return false;
764  }
765 
766  void flushParagraph(std::string& paragraphText, markdown::TokenGroup&
767  paragraphTokens, markdown::TokenGroup& finalTokens, bool noParagraphs)
768  {
769  if (!paragraphText.empty())
770  {
771  paragraphTokens.push_back(TokenPtr(new markdown::token::RawText(paragraphText)));
772  paragraphText.clear();
773  }
774 
775  if (!paragraphTokens.empty())
776  {
777  if (noParagraphs)
778  {
779  if (paragraphTokens.size() > 1)
780  {
781  finalTokens.push_back(TokenPtr(new markdown::token::Container(paragraphTokens)));
782  }
783  else
784  {
785  finalTokens.push_back(*paragraphTokens.begin());
786  }
787  }
788  else
789  {
790  finalTokens.push_back(TokenPtr(new markdown::token::Paragraph(paragraphTokens)));
791  }
792 
793  paragraphTokens.clear();
794  }
795  }
796 
797  std::optional<TokenPtr> parseHeader(CTokenGroupIter& i, CTokenGroupIter end)
798  {
799  if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
800  {
801  // Hash-mark type
802  static const boost::regex cHashHeaders("^(#{1,6}) +(.*?) *#*$");
803  const std::string line = *(*i)->text();
804  boost::smatch m;
805 
806  if (boost::regex_match(line, m, cHashHeaders))
807  {
808  return TokenPtr(new markdown::token::Header(m[1].length(), m[2]));
809  }
810 
811  // Underlined type
812  CTokenGroupIter ii = i;
813  ++ii;
814 
815  if (ii != end && !(*ii)->isBlankLine() && (*ii)->text() && (*ii)->canContainMarkup())
816  {
817  static const boost::regex cUnderlinedHeaders("^([-=])\\1*$");
818  const std::string line = *(*ii)->text();
819 
820  if (boost::regex_match(line, m, cUnderlinedHeaders))
821  {
822  char typeChar = std::string(m[1])[0];
823  TokenPtr p = TokenPtr(new markdown::token::Header((typeChar == '='
824  ? 1 : 2), *(*i)->text()));
825  i = ii;
826  return p;
827  }
828  }
829  }
830 
831  return std::nullopt;
832  }
833 
834  std::optional<TokenPtr> parseHorizontalRule(CTokenGroupIter& i, CTokenGroupIter end)
835  {
836  if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
837  {
838  static const boost::regex cHorizontalRules("^ {0,3}((?:-|\\*|_) *){3,}$");
839  const std::string line = *(*i)->text();
840 
841  if (boost::regex_match(line, cHorizontalRules))
842  {
843  return TokenPtr(new markdown::token::HtmlTag("hr/"));
844  }
845  }
846 
847  return std::nullopt;
848  }
849 
850 } // namespace
851 
852 
853 
854 namespace markdown
855 {
856 
857  std::optional<LinkIds::Target> LinkIds::find(const std::string& id) const
858  {
859  Table::const_iterator i = mTable.find(_scrubKey(id));
860 
861  if (i != mTable.end())
862  {
863  return i->second;
864  }
865  else
866  {
867  return std::nullopt;
868  }
869  }
870 
871  void LinkIds::add(const std::string& id, const std::string& url, const
872  std::string& title)
873  {
874  mTable.insert(std::make_pair(_scrubKey(id), Target(url, title)));
875  }
876 
877  std::string LinkIds::_scrubKey(std::string str)
878  {
879  return simox::alg::to_lower(str);
880  }
881 
882 
883 
884  const size_t Document::cSpacesPerInitialTab = 4; // Required by Markdown format
885  const size_t Document::cDefaultSpacesPerTab = cSpacesPerInitialTab;
886 
887  Document::Document(size_t spacesPerTab): cSpacesPerTab(spacesPerTab),
888  mTokenContainer(new token::Container), mIdTable(new LinkIds),
889  mProcessed(false)
890  {
891  // This space deliberately blank ;-)
892  }
893 
894  Document::Document(std::istream& in, size_t spacesPerTab):
895  cSpacesPerTab(spacesPerTab), mTokenContainer(new token::Container),
896  mIdTable(new LinkIds), mProcessed(false)
897  {
898  read(in);
899  }
900 
902  {
903  delete mIdTable;
904  }
905 
906  bool Document::read(const std::string& src)
907  {
908  std::istringstream in(src);
909  return read(in);
910  }
911 
912  bool Document::_getline(std::istream& in, std::string& line)
913  {
914  // Handles \n, \r, and \r\n (and even \n\r) on any system. Also does tab-
915  // expansion, since this is the most efficient place for it.
916  line.clear();
917 
918  bool initialWhitespace = true;
919  char c;
920 
921  while (in.get(c))
922  {
923  if (c == '\r')
924  {
925  if ((in.get(c)) && c != '\n')
926  {
927  in.unget();
928  }
929 
930  return true;
931  }
932  else if (c == '\n')
933  {
934  if ((in.get(c)) && c != '\r')
935  {
936  in.unget();
937  }
938 
939  return true;
940  }
941  else if (c == '\t')
942  {
943  size_t convert = (initialWhitespace ? cSpacesPerInitialTab :
944  cSpacesPerTab);
945  line += std::string(convert - (line.length() % convert), ' ');
946  }
947  else
948  {
949  line.push_back(c);
950 
951  if (c != ' ')
952  {
953  initialWhitespace = false;
954  }
955  }
956  }
957 
958  return !line.empty();
959  }
960 
961  bool Document::read(std::istream& in)
962  {
963  if (mProcessed)
964  {
965  return false;
966  }
967 
968  token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
969  assert(tokens != 0);
970 
971  std::string line;
972  TokenGroup tgt;
973 
974  while (_getline(in, line))
975  {
976  if (isBlankLine(line))
977  {
978  tgt.push_back(TokenPtr(new token::BlankLine(line)));
979  }
980  else
981  {
982  tgt.push_back(TokenPtr(new token::RawText(line)));
983  }
984  }
985 
986  tokens->appendSubtokens(tgt);
987 
988  return true;
989  }
990 
991  void Document::write(std::ostream& out)
992  {
993  _process();
994  mTokenContainer->writeAsHtml(out);
995  }
996 
997  void Document::writeTokens(std::ostream& out)
998  {
999  _process();
1000  mTokenContainer->writeToken(0, out);
1001  }
1002 
1003  void Document::_process()
1004  {
1005  if (!mProcessed)
1006  {
1007  _mergeMultilineHtmlTags();
1008  _processInlineHtmlAndReferences();
1009  _processBlocksItems(mTokenContainer);
1010  _processParagraphLines(mTokenContainer);
1011  mTokenContainer->processSpanElements(*mIdTable);
1012  mProcessed = true;
1013  }
1014  }
1015 
1016  void Document::_mergeMultilineHtmlTags()
1017  {
1018  static const boost::regex cHtmlTokenStart("<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))$");
1019  static const boost::regex cHtmlTokenEnd("^ *((?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\3))*? */? *))>");
1020 
1021  TokenGroup processed;
1022 
1023  token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
1024  assert(tokens != 0);
1025 
1026  for (TokenGroup::const_iterator i = tokens->subTokens().begin(),
1027  ie = tokens->subTokens().end(); i != ie; ++i)
1028  {
1029  if ((*i)->text() && boost::regex_match(*(*i)->text(), cHtmlTokenStart))
1030  {
1031  TokenGroup::const_iterator i2 = i;
1032  ++i2;
1033 
1034  if (i2 != tokens->subTokens().end() && (*i2)->text() &&
1035  boost::regex_match(*(*i2)->text(), cHtmlTokenEnd))
1036  {
1037  processed.push_back(TokenPtr(new markdown::token::RawText(*(*i)->text() + ' ' + * (*i2)->text())));
1038  ++i;
1039  continue;
1040  }
1041  }
1042 
1043  processed.push_back(*i);
1044  }
1045 
1046  tokens->swapSubtokens(processed);
1047  }
1048 
1049  void Document::_processInlineHtmlAndReferences()
1050  {
1051  TokenGroup processed;
1052 
1053  token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
1054  assert(tokens != 0);
1055 
1056  for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1057  iie = tokens->subTokens().end(); ii != iie; ++ii)
1058  {
1059  if ((*ii)->text())
1060  {
1061  if (processed.empty() || processed.back()->isBlankLine())
1062  {
1063  std::optional<TokenPtr> inlineHtml = parseInlineHtml(ii, iie);
1064 
1065  if (inlineHtml)
1066  {
1067  processed.push_back(*inlineHtml);
1068 
1069  if (ii == iie)
1070  {
1071  break;
1072  }
1073 
1074  continue;
1075  }
1076  }
1077 
1078  if (parseReference(ii, iie, *mIdTable))
1079  {
1080  if (ii == iie)
1081  {
1082  break;
1083  }
1084 
1085  continue;
1086  }
1087 
1088  // If it gets down here, just store it in its current (raw text)
1089  // form. We'll group the raw text lines into paragraphs in a
1090  // later pass, since we can't easily tell where paragraphs
1091  // end until then.
1092  }
1093 
1094  processed.push_back(*ii);
1095  }
1096 
1097  tokens->swapSubtokens(processed);
1098  }
1099 
1100  void Document::_processBlocksItems(TokenPtr inTokenContainer)
1101  {
1102  if (!inTokenContainer->isContainer())
1103  {
1104  return;
1105  }
1106 
1107  token::Container* tokens = dynamic_cast<token::Container*>(inTokenContainer.get());
1108  assert(tokens != 0);
1109 
1110  TokenGroup processed;
1111 
1112  for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1113  iie = tokens->subTokens().end(); ii != iie; ++ii)
1114  {
1115  if ((*ii)->text())
1116  {
1117  std::optional<TokenPtr> subitem;
1118 
1119  if (!subitem)
1120  {
1121  subitem = parseHeader(ii, iie);
1122  }
1123 
1124  if (!subitem)
1125  {
1126  subitem = parseHorizontalRule(ii, iie);
1127  }
1128 
1129  if (!subitem)
1130  {
1131  subitem = parseListBlock(ii, iie);
1132  }
1133 
1134  if (!subitem)
1135  {
1136  subitem = parseBlockQuote(ii, iie);
1137  }
1138 
1139  if (!subitem)
1140  {
1141  subitem = parseCodeBlock(ii, iie);
1142  }
1143 
1144  if (subitem)
1145  {
1146  _processBlocksItems(*subitem);
1147  processed.push_back(*subitem);
1148 
1149  if (ii == iie)
1150  {
1151  break;
1152  }
1153 
1154  continue;
1155  }
1156  else
1157  {
1158  processed.push_back(*ii);
1159  }
1160  }
1161  else if ((*ii)->isContainer())
1162  {
1163  _processBlocksItems(*ii);
1164  processed.push_back(*ii);
1165  }
1166  }
1167 
1168  tokens->swapSubtokens(processed);
1169  }
1170 
1171  void Document::_processParagraphLines(TokenPtr inTokenContainer)
1172  {
1173  token::Container* tokens = dynamic_cast<token::Container*>(inTokenContainer.get());
1174  assert(tokens != 0);
1175 
1176  bool noPara = tokens->inhibitParagraphs();
1177 
1178  for (const auto& ii : tokens->subTokens())
1179  if (ii->isContainer())
1180  {
1181  _processParagraphLines(ii);
1182  }
1183 
1184  TokenGroup processed;
1185  std::string paragraphText;
1186  TokenGroup paragraphTokens;
1187 
1188  for (const auto& ii : tokens->subTokens())
1189  {
1190  if (ii->text() && ii->canContainMarkup() && !ii->inhibitParagraphs())
1191  {
1192  static const boost::regex cExpression("^(.*) $");
1193 
1194  if (!paragraphText.empty())
1195  {
1196  paragraphText += " ";
1197  }
1198 
1199  boost::smatch m;
1200 
1201  if (boost::regex_match(*ii->text(), m, cExpression))
1202  {
1203  paragraphText += m[1];
1204  flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1205  processed.push_back(TokenPtr(new markdown::token::HtmlTag("br/")));
1206  }
1207  else
1208  {
1209  paragraphText += *ii->text();
1210  }
1211  }
1212  else
1213  {
1214  flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1215  processed.push_back(ii);
1216  }
1217  }
1218 
1219  // Make sure the last paragraph is properly flushed too.
1220  flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1221 
1222  tokens->swapSubtokens(processed);
1223  }
1224 
1225 } // namespace markdown
armarx::navigation::platform_controller::platform_global_trajectory::Target
Twist2D Target
Definition: PlatformGlobalTrajectoryController.h:72
markdown::token::Header
Definition: markdown-tokens.h:259
str
std::string str(const T &t)
Definition: UserAssistedSegmenterGuiWidgetController.cpp:42
markdown::token::InlineHtmlContents
Definition: markdown-tokens.h:207
markdown::Document::writeTokens
void writeTokens(std::ostream &)
Definition: markdown.cpp:997
markdown.h
c
constexpr T c
Definition: UnscentedKalmanFilterTest.cpp:43
markdown::token::UnorderedList
Definition: markdown-tokens.h:455
armarx::detail::StreamPrinterTag::tag
@ tag
markdown::token::isValidTag
size_t isValidTag(const std::string &tag, bool nonBlockFirst)
Definition: markdown-tokens.cpp:271
convert
void convert(const std::filesystem::path &in, const std::filesystem::path &out, bool print_progress)
Performs the actual conversion.
Definition: main.cpp:154
markdown::token::Container
Definition: markdown-tokens.h:333
markdown::Document::read
bool read(const std::string &)
Definition: markdown.cpp:906
markdown::LinkIds::add
void add(const std::string &id, const std::string &url, const std::string &title)
Definition: markdown.cpp:871
markdown::TokenPtr
std::shared_ptr< Token > TokenPtr
Definition: markdown.h:21
markdown::Document::Document
Document(size_t spacesPerTab=cDefaultSpacesPerTab)
Definition: markdown.cpp:887
markdown::Document::~Document
~Document()
Definition: markdown.cpp:901
markdown::token::RawText
Definition: markdown-tokens.h:148
markdown::token::BlankLine
Definition: markdown-tokens.h:290
markdown
Definition: markdown-tokens.cpp:16
markdown::Document::write
void write(std::ostream &)
Definition: markdown.cpp:991
markdown::token::InlineHtmlComment
Definition: markdown-tokens.h:219
armarx::control::hardware_config::tagName
std::string tagName(ConfigTag tag)
Definition: Config.cpp:302
markdown::LinkIds
Definition: markdown-tokens.h:21
markdown::CTokenGroupIter
TokenGroup::const_iterator CTokenGroupIter
Definition: markdown-tokens.h:19
markdown::token::ListItem
Definition: markdown-tokens.h:416
armarx::to_string
const std::string & to_string(const std::string &s)
Definition: StringHelpers.h:40
markdown::token::OrderedList
Definition: markdown-tokens.h:480
markdown::token::Paragraph
Definition: markdown-tokens.h:531
markdown::token::CodeBlock
Definition: markdown-tokens.h:231
markdown-tokens.h
markdown::token::BlockQuote
Definition: markdown-tokens.h:506
sub
Point sub(const Point &x, const Point &y)
Definition: point.hpp:43
markdown::TokenGroup
std::list< TokenPtr > TokenGroup
Definition: markdown.h:22
markdown::token::InlineHtmlBlock
Definition: markdown-tokens.h:381
markdown::LinkIds::find
std::optional< Target > find(const std::string &id) const
Definition: markdown.cpp:857
markdown::token::Container::appendSubtokens
void appendSubtokens(TokenGroup &tokens)
Definition: markdown-tokens.h:343
markdown::token::HtmlTag
Definition: markdown-tokens.h:175