markdown.cpp
Go to the documentation of this file.
1
2/*
3 Copyright (c) 2009 by Chad Nelson
4 Released under the MIT License.
5 See the provided LICENSE.TXT file for details.
6*/
7
8#include "markdown.h"
9
10#include <cassert>
11#include <sstream>
12
13#include <boost/regex.hpp>
14
15#include <SimoxUtility/algorithm/string/string_tools.h>
16
17#include "markdown-tokens.h"
18
19
22
23namespace
24{
25
// Description of a single HTML tag recognised by parseHtmlTag().
struct HtmlTagInfo
{
    std::string tagName;       // Name of the tag (third regex capture)
    std::string extra;         // Attribute text captured after the name; empty if none
    bool isClosingTag = false; // True when a '/' directly follows the '<'
    size_t lengthOfToken = 0;  // Length of the whole tag, in the original string
};
32
33 const std::string cHtmlTokenSource(
34 "<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))>");
35 const boost::regex cHtmlTokenExpression(cHtmlTokenSource),
36 cStartHtmlTokenExpression("^" + cHtmlTokenSource),
37 cOneHtmlTokenExpression("^" + cHtmlTokenSource + "$");
38
// Selects how parseHtmlTag() anchors the tag pattern.
enum ParseHtmlTagFlags
{
    cAlone = 0, // The tag must span the whole input
    cStarts = 1 // The tag must appear at the start of the input
};
44
45 std::optional<HtmlTagInfo>
46 parseHtmlTag(std::string::const_iterator begin,
47 std::string::const_iterator end,
48 ParseHtmlTagFlags flags)
49 {
50 boost::smatch m;
51
52 if (boost::regex_search(
53 begin,
54 end,
55 m,
56 (flags == cAlone ? cOneHtmlTokenExpression : cStartHtmlTokenExpression)))
57 {
58 HtmlTagInfo r;
59 r.tagName = m[3];
60
61 if (m[4].matched)
62 {
63 r.extra = m[4];
64 }
65
66 r.isClosingTag = (m[2].length() > 0);
67 r.lengthOfToken = m[0].length();
68 return r;
69 }
70
71 return std::nullopt;
72 }
73
75 parseInlineHtmlText(const std::string& src)
76 {
78 std::string::const_iterator prev = src.begin(), end = src.end();
79
80 while (true)
81 {
82 boost::smatch m;
83
84 if (boost::regex_search(prev, end, m, cHtmlTokenExpression))
85 {
86 if (prev != m[0].first)
87 {
88 //cerr << " Non-tag (" << std::distance(prev, m[0].first) << "): " << std::string(prev, m[0].first) << endl;
89 r.push_back(TokenPtr(
90 new markdown::token::InlineHtmlContents(std::string(prev, m[0].first))));
91 }
92
93 //cerr << " Tag: " << m[1] << endl;
94 r.push_back(TokenPtr(new markdown::token::HtmlTag(m[1])));
95 prev = m[0].second;
96 }
97 else
98 {
99 std::string eol;
100
101 if (prev != end)
102 {
103 eol = std::string(prev, end);
104 //cerr << " Non-tag: " << eol << endl;
105 }
106
107 eol += '\n';
108 r.push_back(TokenPtr(new markdown::token::InlineHtmlContents(eol)));
109 break;
110 }
111 }
112
113 return r;
114 }
115
116 bool
117 isHtmlCommentStart(std::string::const_iterator begin, std::string::const_iterator end)
118 {
119 // It can't be a single-line comment, those will already have been parsed
120 // by isBlankLine.
121 static const boost::regex cExpression("^<!--");
122 return boost::regex_search(begin, end, cExpression);
123 }
124
125 bool
126 isHtmlCommentEnd(std::string::const_iterator begin, std::string::const_iterator end)
127 {
128 static const boost::regex cExpression(".*-- *>$");
129 return boost::regex_match(begin, end, cExpression);
130 }
131
132 bool
133 isBlankLine(const std::string& line)
134 {
135 static const boost::regex cExpression(" {0,3}(<--(.*)-- *> *)* *");
136 return boost::regex_match(line, cExpression);
137 }
138
139 std::optional<TokenPtr>
140 parseInlineHtml(CTokenGroupIter& i, CTokenGroupIter end)
141 {
142 // Preconditions: Previous line was blank, or this is the first line.
143 if ((*i)->text())
144 {
145 const std::string line(*(*i)->text());
146
147 bool tag = false, comment = false;
148 std::optional<HtmlTagInfo> tagInfo = parseHtmlTag(line.begin(), line.end(), cStarts);
149
150 if (tagInfo && markdown::token::isValidTag(tagInfo->tagName) > 1)
151 {
152 tag = true;
153 }
154 else if (isHtmlCommentStart(line.begin(), line.end()))
155 {
156 comment = true;
157 }
158
159 if (tag)
160 {
161 // Block continues until an HTML tag (alone) on a line followed by a
162 // blank line.
163 markdown::TokenGroup contents;
164 CTokenGroupIter firstLine = i, prevLine = i;
165 size_t lines = 0;
166
167 bool done = false;
168
169 do
170 {
171 // We encode HTML tags so that their contents gets properly
172 // handled -- i.e. "<div style=">"/>" becomes <div style="&gt;"/>
173 if ((*i)->text())
174 {
175 markdown::TokenGroup t = parseInlineHtmlText(*(*i)->text());
176 contents.splice(contents.end(), t);
177 }
178 else
179 {
180 contents.push_back(*i);
181 }
182
183 prevLine = i;
184 ++i;
185 ++lines;
186
187 if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
188 {
189 if (prevLine == firstLine)
190 {
191 done = true;
192 }
193 else
194 {
195 const std::string text(*(*prevLine)->text());
196
197 if (parseHtmlTag(text.begin(), text.end(), cAlone))
198 {
199 done = true;
200 }
201 }
202 }
203 } while (i != end && !done);
204
205 if (lines > 1 || markdown::token::isValidTag(tagInfo->tagName, true) > 1)
206 {
207 i = prevLine;
208 return TokenPtr(new markdown::token::InlineHtmlBlock(contents));
209 }
210 else
211 {
212 // Single-line HTML "blocks" whose initial tags are span-tags
213 // don't qualify as inline HTML.
214 i = firstLine;
215 return std::nullopt;
216 }
217 }
218 else if (comment)
219 {
220 // Comment continues until a closing tag is found; at present, it
221 // also has to be the last thing on the line, and has to be
222 // immediately followed by a blank line too.
223 markdown::TokenGroup contents;
224 CTokenGroupIter firstLine = i, prevLine = i;
225
226 bool done = false;
227
228 do
229 {
230 if ((*i)->text())
231 {
232 contents.push_back(
233 TokenPtr(new markdown::token::InlineHtmlComment(*(*i)->text() + '\n')));
234 }
235 else
236 {
237 contents.push_back(*i);
238 }
239
240 prevLine = i;
241 ++i;
242
243 if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
244 {
245 if (prevLine == firstLine)
246 {
247 done = true;
248 }
249 else
250 {
251 const std::string text(*(*prevLine)->text());
252
253 if (isHtmlCommentEnd(text.begin(), text.end()))
254 {
255 done = true;
256 }
257 }
258 }
259 } while (i != end && !done);
260
261 i = prevLine;
262 return TokenPtr(new markdown::token::InlineHtmlBlock(contents));
263 }
264 }
265
266 return std::nullopt;
267 }
268
269 std::optional<std::string>
270 isCodeBlockLine(CTokenGroupIter& i, CTokenGroupIter end)
271 {
272 if ((*i)->isBlankLine())
273 {
274 // If we get here, we're already in a code block.
275 ++i;
276
277 if (i != end)
278 {
279 std::optional<std::string> r = isCodeBlockLine(i, end);
280
281 if (r)
282 {
283 return std::string("\n" + *r);
284 }
285 }
286
287 --i;
288 }
289 else if ((*i)->text() && (*i)->canContainMarkup())
290 {
291 std::string line(*(*i)->text());
292
293 if (line.length() >= 4)
294 {
295 std::string::iterator si = line.begin(), sie = si + 4;
296
297 while (si != sie && *si == ' ')
298 {
299 ++si;
300 }
301
302 if (si == sie)
303 {
304 ++i;
305 return std::string(si, line.end());
306 }
307 }
308 }
309
310 return std::nullopt;
311 }
312
313 std::optional<TokenPtr>
314 parseCodeBlock(CTokenGroupIter& i, CTokenGroupIter end)
315 {
316 if (!(*i)->isBlankLine())
317 {
318 std::optional<std::string> contents = isCodeBlockLine(i, end);
319
320 if (contents)
321 {
322 std::ostringstream out;
323 out << *contents << '\n';
324
325 while (i != end)
326 {
327 contents = isCodeBlockLine(i, end);
328
329 if (contents)
330 {
331 out << *contents << '\n';
332 }
333 else
334 {
335 break;
336 }
337 }
338
339 return TokenPtr(new markdown::token::CodeBlock(out.str()));
340 }
341 }
342
343 return std::nullopt;
344 }
345
// Returns the number of '>' characters in prefixString.
size_t
countQuoteLevel(const std::string& prefixString)
{
    size_t level = 0;

    for (std::string::size_type pos = prefixString.find('>'); pos != std::string::npos;
         pos = prefixString.find('>', pos + 1))
    {
        ++level;
    }

    return level;
}
359
360 std::optional<TokenPtr>
361 parseBlockQuote(CTokenGroupIter& i, CTokenGroupIter end)
362 {
363 static const boost::regex cBlockQuoteExpression("^((?: {0,3}>)+) (.*)$");
364 // Useful captures: 1=prefix, 2=content
365
366 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
367 {
368 const std::string line(*(*i)->text());
369 boost::smatch m;
370
371 if (boost::regex_match(line, m, cBlockQuoteExpression))
372 {
373 size_t quoteLevel = countQuoteLevel(m[1]);
374 boost::regex continuationExpression =
375 boost::regex("^((?: {0,3}>){" + std::to_string(quoteLevel) + "}) ?(.*)$");
376
377 markdown::TokenGroup subTokens;
378 subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
379
380 // The next line can be a continuation of this quote (with or
381 // without the prefix string) or a blank line. Blank lines are
382 // treated as part of this quote if the following line is a
383 // properly-prefixed quote line too, otherwise they terminate the
384 // quote.
385 ++i;
386
387 while (i != end)
388 {
389 if ((*i)->isBlankLine())
390 {
391 CTokenGroupIter ii = i;
392 ++ii;
393
394 if (ii == end)
395 {
396 i = ii;
397 break;
398 }
399 else
400 {
401 const std::string line(*(*ii)->text());
402
403 if (boost::regex_match(line, m, continuationExpression))
404 {
405 if (m[1].matched && m[1].length() > 0)
406 {
407 i = ++ii;
408 subTokens.push_back(TokenPtr(new markdown::token::BlankLine));
409 subTokens.push_back(
411 }
412 else
413 {
414 break;
415 }
416 }
417 else
418 {
419 break;
420 }
421 }
422 }
423 else
424 {
425 const std::string line(*(*i)->text());
426
427 if (boost::regex_match(line, m, continuationExpression))
428 {
429 assert(m[2].matched);
430
431 if (!isBlankLine(m[2]))
432 {
433 subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
434 }
435 else
436 {
437 subTokens.push_back(TokenPtr(new markdown::token::BlankLine(m[2])));
438 }
439
440 ++i;
441 }
442 else
443 {
444 break;
445 }
446 }
447 }
448
449 return TokenPtr(new markdown::token::BlockQuote(subTokens));
450 }
451 }
452
453 return std::nullopt;
454 }
455
456 std::optional<TokenPtr>
457 parseListBlock(CTokenGroupIter& i, CTokenGroupIter end, bool sub = false)
458 {
459 static const boost::regex cUnorderedListExpression("^( *)([*+-]) +([^*-].*)$");
460 static const boost::regex cOrderedListExpression("^( *)([0-9]+)\\. +(.*)$");
461
462 enum ListType
463 {
464 cNone,
465 cUnordered,
466 cOrdered
467 };
468
469 ListType type = cNone;
470
471 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
472 {
473 boost::regex nextItemExpression, startSublistExpression;
474 size_t indent = 0;
475
476 const std::string line((*i)->text().value());
477
478 //cerr << "IsList? " << line << endl;
479
480 markdown::TokenGroup subTokens, subItemTokens;
481
482 boost::smatch m;
483
484 if (boost::regex_match(line, m, cUnorderedListExpression))
485 {
486 indent = m[1].length();
487
488 if (sub || indent < 4)
489 {
490 type = cUnordered;
491 char startChar = *m[2].first;
492 subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[3])));
493
494 std::ostringstream next;
495 next << "^" << std::string(indent, ' ') << "\\" << startChar << " +([^*-].*)$";
496 nextItemExpression = next.str();
497 }
498 }
499 else if (boost::regex_match(line, m, cOrderedListExpression))
500 {
501 indent = m[1].length();
502
503 if (sub || indent < 4)
504 {
505 type = cOrdered;
506 subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[3])));
507
508 std::ostringstream next;
509 next << "^" << std::string(indent, ' ') << "[0-9]+\\. +(.*)$";
510 nextItemExpression = next.str();
511 }
512 }
513
514 if (type != cNone)
515 {
516 CTokenGroupIter originalI = i;
517 size_t itemCount = 1;
518 std::ostringstream sub;
519 sub << "^" << std::string(indent, ' ') << " +(([*+-])|([0-9]+\\.)) +.*$";
520 startSublistExpression = sub.str();
521
522 // There are several options for the next line. It's another item in
523 // this list (in which case this one is done); it's a continuation
524 // of this line (collect it and keep going); it's the first item in
525 // a sub-list (call this function recursively to collect it), it's
526 // the next item in the parent list (this one is ended); or it's
527 // blank.
528 //
529 // A blank line requires looking ahead. If the next line is an item
530 // for this list, then switch this list into paragraph-items mode
531 // and continue processing. If it's indented by four or more spaces
532 // (more than the list itself), then it's another continuation of
533 // the current item. Otherwise it's either a new paragraph (and this
534 // list is ended) or the beginning of a sub-list.
535 static const boost::regex cContinuedItemExpression("^ *([^ ].*)$");
536
537 boost::regex continuedAfterBlankLineExpression("^ {" + std::to_string(indent + 4) +
538 "}([^ ].*)$");
539 boost::regex codeBlockAfterBlankLineExpression("^ {" + std::to_string(indent + 8) +
540 "}(.*)$");
541
542 enum NextItemType
543 {
544 cUnknown,
545 cEndOfList,
546 cAnotherItem
547 };
548
549 NextItemType nextItem = cUnknown;
550 bool setParagraphMode = false;
551
552 ++i;
553
554 while (i != end)
555 {
556 if ((*i)->isBlankLine())
557 {
558 CTokenGroupIter ii = i;
559 ++ii;
560
561 if (ii == end)
562 {
563 i = ii;
564 nextItem = cEndOfList;
565 }
566 else if ((*ii)->text())
567 {
568 const std::string line(*(*ii)->text());
569
570 if (boost::regex_match(line, startSublistExpression))
571 {
572 setParagraphMode = true;
573 ++itemCount;
574 i = ii;
575 std::optional<TokenPtr> p = parseListBlock(i, end, true);
576 assert(p);
577 subItemTokens.push_back(*p);
578 continue;
579 }
580 else if (boost::regex_match(line, m, nextItemExpression))
581 {
582 setParagraphMode = true;
583 i = ii;
584 nextItem = cAnotherItem;
585 }
586 else if (boost::regex_match(line, m, continuedAfterBlankLineExpression))
587 {
588 assert(m[1].matched);
589 subItemTokens.push_back(TokenPtr(new markdown::token::BlankLine()));
590 subItemTokens.push_back(
592 i = ++ii;
593 continue;
594 }
595 else if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression))
596 {
597 setParagraphMode = true;
598 ++itemCount;
599 assert(m[1].matched);
600 subItemTokens.push_back(TokenPtr(new markdown::token::BlankLine()));
601
602 std::string codeBlock = m[1] + '\n';
603 ++ii;
604
605 while (ii != end)
606 {
607 if ((*ii)->isBlankLine())
608 {
609 CTokenGroupIter iii = ii;
610 ++iii;
611 const std::string nextLine(*(*iii)->text());
612
613 if (boost::regex_match(
614 nextLine, m, codeBlockAfterBlankLineExpression))
615 {
616 codeBlock += '\n' + m[1] + '\n';
617 ii = iii;
618 }
619 else
620 {
621 break;
622 }
623 }
624 else if ((*ii)->text())
625 {
626 const std::string line(*(*ii)->text());
627
628 if (boost::regex_match(
629 line, m, codeBlockAfterBlankLineExpression))
630 {
631 codeBlock += m[1] + '\n';
632 }
633 else
634 {
635 break;
636 }
637 }
638 else
639 {
640 break;
641 }
642
643 ++ii;
644 }
645
646 subItemTokens.push_back(
647 TokenPtr(new markdown::token::CodeBlock(codeBlock)));
648 i = ii;
649 continue;
650 }
651 else
652 {
653 nextItem = cEndOfList;
654 }
655 }
656 else
657 {
658 break;
659 }
660 }
661 else if ((*i)->text())
662 {
663 const std::string line(*(*i)->text());
664
665 if (boost::regex_match(line, startSublistExpression))
666 {
667 ++itemCount;
668 std::optional<TokenPtr> p = parseListBlock(i, end, true);
669 assert(p);
670 subItemTokens.push_back(*p);
671 continue;
672 }
673 else if (boost::regex_match(line, m, nextItemExpression))
674 {
675 nextItem = cAnotherItem;
676 }
677 else
678 {
679 if (boost::regex_match(line, m, cUnorderedListExpression) ||
680 boost::regex_match(line, m, cOrderedListExpression))
681 {
682 // Belongs to the parent list
683 nextItem = cEndOfList;
684 }
685 else
686 {
687 boost::regex_match(line, m, cContinuedItemExpression);
688 assert(m[1].matched);
689 subItemTokens.push_back(
691 ++i;
692 continue;
693 }
694 }
695 }
696 else
697 {
698 nextItem = cEndOfList;
699 }
700
701 if (!subItemTokens.empty())
702 {
703 subTokens.push_back(TokenPtr(new markdown::token::ListItem(subItemTokens)));
704 subItemTokens.clear();
705 }
706
707 assert(nextItem != cUnknown);
708
709 if (nextItem == cAnotherItem)
710 {
711 subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[1])));
712 ++itemCount;
713 ++i;
714 }
715 else // nextItem==cEndOfList
716 {
717 break;
718 }
719 }
720
721 // In case we hit the end with an unterminated item...
722 if (!subItemTokens.empty())
723 {
724 subTokens.push_back(TokenPtr(new markdown::token::ListItem(subItemTokens)));
725 subItemTokens.clear();
726 }
727
728 if (itemCount > 1 || indent != 0)
729 {
730 if (type == cUnordered)
731 {
732 return TokenPtr(
733 new markdown::token::UnorderedList(subTokens, setParagraphMode));
734 }
735 else
736 {
737 return TokenPtr(
738 new markdown::token::OrderedList(subTokens, setParagraphMode));
739 }
740 }
741 else
742 {
743 // It looked like a list, but turned out to be a false alarm.
744 i = originalI;
745 return std::nullopt;
746 }
747 }
748 }
749
750 return std::nullopt;
751 }
752
753 bool
754 parseReference(CTokenGroupIter& i, CTokenGroupIter end, markdown::LinkIds& idTable)
755 {
756 if ((*i)->text())
757 {
758 static const boost::regex cReference(
759 "^ {0,3}\\[(.+)\\]: +<?([^ >]+)>?(?: *(?:('|\")(.*)\\3)|(?:\\((.*)\\)))?$");
760 // Useful captures: 1=id, 2=url, 4/5=title
761
762 const std::string line1(*(*i)->text());
763 boost::smatch m;
764
765 if (boost::regex_match(line1, m, cReference))
766 {
767 std::string id(m[1]), url(m[2]), title;
768
769 if (m[4].matched)
770 {
771 title = m[4];
772 }
773 else if (m[5].matched)
774 {
775 title = m[5];
776 }
777 else
778 {
779 CTokenGroupIter ii = i;
780 ++ii;
781
782 if (ii != end && (*ii)->text())
783 {
784 // It could be on the next line
785 static const boost::regex cSeparateTitle(
786 "^ *(?:(?:('|\")(.*)\\1)|(?:\\((.*)\\))) *$");
787 // Useful Captures: 2/3=title
788
789 const std::string line2(*(*ii)->text());
790
791 if (boost::regex_match(line2, m, cSeparateTitle))
792 {
793 ++i;
794 title = (m[2].matched ? m[2] : m[3]);
795 }
796 }
797 }
798
799 idTable.add(id, url, title);
800 return true;
801 }
802 }
803
804 return false;
805 }
806
807 void
808 flushParagraph(std::string& paragraphText,
809 markdown::TokenGroup& paragraphTokens,
810 markdown::TokenGroup& finalTokens,
811 bool noParagraphs)
812 {
813 if (!paragraphText.empty())
814 {
815 paragraphTokens.push_back(TokenPtr(new markdown::token::RawText(paragraphText)));
816 paragraphText.clear();
817 }
818
819 if (!paragraphTokens.empty())
820 {
821 if (noParagraphs)
822 {
823 if (paragraphTokens.size() > 1)
824 {
825 finalTokens.push_back(
826 TokenPtr(new markdown::token::Container(paragraphTokens)));
827 }
828 else
829 {
830 finalTokens.push_back(*paragraphTokens.begin());
831 }
832 }
833 else
834 {
835 finalTokens.push_back(TokenPtr(new markdown::token::Paragraph(paragraphTokens)));
836 }
837
838 paragraphTokens.clear();
839 }
840 }
841
842 std::optional<TokenPtr>
843 parseHeader(CTokenGroupIter& i, CTokenGroupIter end)
844 {
845 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
846 {
847 // Hash-mark type
848 static const boost::regex cHashHeaders("^(#{1,6}) +(.*?) *#*$");
849 const std::string line = *(*i)->text();
850 boost::smatch m;
851
852 if (boost::regex_match(line, m, cHashHeaders))
853 {
854 return TokenPtr(new markdown::token::Header(m[1].length(), m[2]));
855 }
856
857 // Underlined type
858 CTokenGroupIter ii = i;
859 ++ii;
860
861 if (ii != end && !(*ii)->isBlankLine() && (*ii)->text() && (*ii)->canContainMarkup())
862 {
863 static const boost::regex cUnderlinedHeaders("^([-=])\\1*$");
864 const std::string line = *(*ii)->text();
865
866 if (boost::regex_match(line, m, cUnderlinedHeaders))
867 {
868 char typeChar = std::string(m[1])[0];
869 TokenPtr p = TokenPtr(
870 new markdown::token::Header((typeChar == '=' ? 1 : 2), *(*i)->text()));
871 i = ii;
872 return p;
873 }
874 }
875 }
876
877 return std::nullopt;
878 }
879
880 std::optional<TokenPtr>
881 parseHorizontalRule(CTokenGroupIter& i, CTokenGroupIter end)
882 {
883 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
884 {
885 static const boost::regex cHorizontalRules("^ {0,3}((?:-|\\*|_) *){3,}$");
886 const std::string line = *(*i)->text();
887
888 if (boost::regex_match(line, cHorizontalRules))
889 {
890 return TokenPtr(new markdown::token::HtmlTag("hr/"));
891 }
892 }
893
894 return std::nullopt;
895 }
896
897} // namespace
898
899namespace markdown
900{
901
902 std::optional<LinkIds::Target>
903 LinkIds::find(const std::string& id) const
904 {
905 Table::const_iterator i = mTable.find(_scrubKey(id));
906
907 if (i != mTable.end())
908 {
909 return i->second;
910 }
911 else
912 {
913 return std::nullopt;
914 }
915 }
916
917 void
918 LinkIds::add(const std::string& id, const std::string& url, const std::string& title)
919 {
920 mTable.insert(std::make_pair(_scrubKey(id), Target(url, title)));
921 }
922
923 std::string
924 LinkIds::_scrubKey(std::string str)
925 {
926 return simox::alg::to_lower(str);
927 }
928
929 const size_t Document::cSpacesPerInitialTab = 4; // Required by Markdown format
930 const size_t Document::cDefaultSpacesPerTab = cSpacesPerInitialTab;
931
932 Document::Document(size_t spacesPerTab) :
933 cSpacesPerTab(spacesPerTab),
934 mTokenContainer(new token::Container),
935 mIdTable(new LinkIds),
936 mProcessed(false)
937 {
938 // This space deliberately blank ;-)
939 }
940
941 Document::Document(std::istream& in, size_t spacesPerTab) :
942 cSpacesPerTab(spacesPerTab),
943 mTokenContainer(new token::Container),
944 mIdTable(new LinkIds),
945 mProcessed(false)
946 {
947 read(in);
948 }
949
951 {
952 delete mIdTable;
953 }
954
955 bool
956 Document::read(const std::string& src)
957 {
958 std::istringstream in(src);
959 return read(in);
960 }
961
962 bool
963 Document::_getline(std::istream& in, std::string& line)
964 {
965 // Handles \n, \r, and \r\n (and even \n\r) on any system. Also does tab-
966 // expansion, since this is the most efficient place for it.
967 line.clear();
968
969 bool initialWhitespace = true;
970 char c;
971
972 while (in.get(c))
973 {
974 if (c == '\r')
975 {
976 if ((in.get(c)) && c != '\n')
977 {
978 in.unget();
979 }
980
981 return true;
982 }
983 else if (c == '\n')
984 {
985 if ((in.get(c)) && c != '\r')
986 {
987 in.unget();
988 }
989
990 return true;
991 }
992 else if (c == '\t')
993 {
994 size_t convert = (initialWhitespace ? cSpacesPerInitialTab : cSpacesPerTab);
995 line += std::string(convert - (line.length() % convert), ' ');
996 }
997 else
998 {
999 line.push_back(c);
1000
1001 if (c != ' ')
1002 {
1003 initialWhitespace = false;
1004 }
1005 }
1006 }
1007
1008 return !line.empty();
1009 }
1010
1011 bool
1012 Document::read(std::istream& in)
1013 {
1014 if (mProcessed)
1015 {
1016 return false;
1017 }
1018
1019 token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
1020 assert(tokens != 0);
1021
1022 std::string line;
1023 TokenGroup tgt;
1024
1025 while (_getline(in, line))
1026 {
1027 if (isBlankLine(line))
1028 {
1029 tgt.push_back(TokenPtr(new token::BlankLine(line)));
1030 }
1031 else
1032 {
1033 tgt.push_back(TokenPtr(new token::RawText(line)));
1034 }
1035 }
1036
1037 tokens->appendSubtokens(tgt);
1038
1039 return true;
1040 }
1041
1042 void
1043 Document::write(std::ostream& out)
1044 {
1045 _process();
1046 mTokenContainer->writeAsHtml(out);
1047 }
1048
1049 void
1050 Document::writeTokens(std::ostream& out)
1051 {
1052 _process();
1053 mTokenContainer->writeToken(0, out);
1054 }
1055
1056 void
1057 Document::_process()
1058 {
1059 if (!mProcessed)
1060 {
1061 _mergeMultilineHtmlTags();
1062 _processInlineHtmlAndReferences();
1063 _processBlocksItems(mTokenContainer);
1064 _processParagraphLines(mTokenContainer);
1065 mTokenContainer->processSpanElements(*mIdTable);
1066 mProcessed = true;
1067 }
1068 }
1069
1070 void
1071 Document::_mergeMultilineHtmlTags()
1072 {
1073 static const boost::regex cHtmlTokenStart(
1074 "<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))$");
1075 static const boost::regex cHtmlTokenEnd(
1076 "^ *((?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\3))*? */? *))>");
1077
1078 TokenGroup processed;
1079
1080 token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
1081 assert(tokens != 0);
1082
1083 for (TokenGroup::const_iterator i = tokens->subTokens().begin(),
1084 ie = tokens->subTokens().end();
1085 i != ie;
1086 ++i)
1087 {
1088 if ((*i)->text() && boost::regex_match(*(*i)->text(), cHtmlTokenStart))
1089 {
1090 TokenGroup::const_iterator i2 = i;
1091 ++i2;
1092
1093 if (i2 != tokens->subTokens().end() && (*i2)->text() &&
1094 boost::regex_match(*(*i2)->text(), cHtmlTokenEnd))
1095 {
1096 processed.push_back(TokenPtr(
1097 new markdown::token::RawText(*(*i)->text() + ' ' + *(*i2)->text())));
1098 ++i;
1099 continue;
1100 }
1101 }
1102
1103 processed.push_back(*i);
1104 }
1105
1106 tokens->swapSubtokens(processed);
1107 }
1108
1109 void
1110 Document::_processInlineHtmlAndReferences()
1111 {
1112 TokenGroup processed;
1113
1114 token::Container* tokens = dynamic_cast<token::Container*>(mTokenContainer.get());
1115 assert(tokens != 0);
1116
1117 for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1118 iie = tokens->subTokens().end();
1119 ii != iie;
1120 ++ii)
1121 {
1122 if ((*ii)->text())
1123 {
1124 if (processed.empty() || processed.back()->isBlankLine())
1125 {
1126 std::optional<TokenPtr> inlineHtml = parseInlineHtml(ii, iie);
1127
1128 if (inlineHtml)
1129 {
1130 processed.push_back(*inlineHtml);
1131
1132 if (ii == iie)
1133 {
1134 break;
1135 }
1136
1137 continue;
1138 }
1139 }
1140
1141 if (parseReference(ii, iie, *mIdTable))
1142 {
1143 if (ii == iie)
1144 {
1145 break;
1146 }
1147
1148 continue;
1149 }
1150
1151 // If it gets down here, just store it in its current (raw text)
1152 // form. We'll group the raw text lines into paragraphs in a
1153 // later pass, since we can't easily tell where paragraphs
1154 // end until then.
1155 }
1156
1157 processed.push_back(*ii);
1158 }
1159
1160 tokens->swapSubtokens(processed);
1161 }
1162
1163 void
1164 Document::_processBlocksItems(TokenPtr inTokenContainer)
1165 {
1166 if (!inTokenContainer->isContainer())
1167 {
1168 return;
1169 }
1170
1171 token::Container* tokens = dynamic_cast<token::Container*>(inTokenContainer.get());
1172 assert(tokens != 0);
1173
1174 TokenGroup processed;
1175
1176 for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1177 iie = tokens->subTokens().end();
1178 ii != iie;
1179 ++ii)
1180 {
1181 if ((*ii)->text())
1182 {
1183 std::optional<TokenPtr> subitem;
1184
1185 if (!subitem)
1186 {
1187 subitem = parseHeader(ii, iie);
1188 }
1189
1190 if (!subitem)
1191 {
1192 subitem = parseHorizontalRule(ii, iie);
1193 }
1194
1195 if (!subitem)
1196 {
1197 subitem = parseListBlock(ii, iie);
1198 }
1199
1200 if (!subitem)
1201 {
1202 subitem = parseBlockQuote(ii, iie);
1203 }
1204
1205 if (!subitem)
1206 {
1207 subitem = parseCodeBlock(ii, iie);
1208 }
1209
1210 if (subitem)
1211 {
1212 _processBlocksItems(*subitem);
1213 processed.push_back(*subitem);
1214
1215 if (ii == iie)
1216 {
1217 break;
1218 }
1219
1220 continue;
1221 }
1222 else
1223 {
1224 processed.push_back(*ii);
1225 }
1226 }
1227 else if ((*ii)->isContainer())
1228 {
1229 _processBlocksItems(*ii);
1230 processed.push_back(*ii);
1231 }
1232 }
1233
1234 tokens->swapSubtokens(processed);
1235 }
1236
1237 void
1238 Document::_processParagraphLines(TokenPtr inTokenContainer)
1239 {
1240 token::Container* tokens = dynamic_cast<token::Container*>(inTokenContainer.get());
1241 assert(tokens != 0);
1242
1243 bool noPara = tokens->inhibitParagraphs();
1244
1245 for (const auto& ii : tokens->subTokens())
1246 if (ii->isContainer())
1247 {
1248 _processParagraphLines(ii);
1249 }
1250
1251 TokenGroup processed;
1252 std::string paragraphText;
1253 TokenGroup paragraphTokens;
1254
1255 for (const auto& ii : tokens->subTokens())
1256 {
1257 if (ii->text() && ii->canContainMarkup() && !ii->inhibitParagraphs())
1258 {
1259 static const boost::regex cExpression("^(.*) $");
1260
1261 if (!paragraphText.empty())
1262 {
1263 paragraphText += " ";
1264 }
1265
1266 boost::smatch m;
1267
1268 if (boost::regex_match(*ii->text(), m, cExpression))
1269 {
1270 paragraphText += m[1];
1271 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1272 processed.push_back(TokenPtr(new markdown::token::HtmlTag("br/")));
1273 }
1274 else
1275 {
1276 paragraphText += *ii->text();
1277 }
1278 }
1279 else
1280 {
1281 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1282 processed.push_back(ii);
1283 }
1284 }
1285
1286 // Make sure the last paragraph is properly flushed too.
1287 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1288
1289 tokens->swapSubtokens(processed);
1290 }
1291
1292} // namespace markdown
constexpr T c
std::string str(const T &t)
void convert(const std::filesystem::path &in, const std::filesystem::path &out, bool print_progress)
Performs the actual conversion.
Definition main.cpp:170
Document(size_t spacesPerTab=cDefaultSpacesPerTab)
Definition markdown.cpp:932
void write(std::ostream &)
void writeTokens(std::ostream &)
bool read(const std::string &)
Definition markdown.cpp:956
std::optional< Target > find(const std::string &id) const
Definition markdown.cpp:903
void add(const std::string &id, const std::string &url, const std::string &title)
Definition markdown.cpp:918
void appendSubtokens(TokenGroup &tokens)
size_t isValidTag(const std::string &tag, bool nonBlockFirst)
std::shared_ptr< Token > TokenPtr
Definition markdown.h:21
std::list< TokenPtr > TokenGroup
Definition markdown.h:22
TokenGroup::const_iterator CTokenGroupIter
Point sub(const Point &x, const Point &y)
Definition point.hpp:46