markdown-tokens.cpp
Go to the documentation of this file.
1
2/*
3 Copyright (c) 2009 by Chad Nelson
4 Released under the MIT License.
5 See the provided LICENSE.TXT file for details.
6*/
7
8#include "markdown-tokens.h"
9
10#include <set>
11#include <sstream>
12#include <stack>
13
14#include <boost/regex.hpp>
15
17{
18
19 namespace
20 {
21
// The characters Markdown allows to be backslash-escaped. A character's
// position in this string is the index stored in "escaped" placeholders.
const std::string cEscapedCharacters("\\`*_{}[]()#+-.!>");

// Returns the position of `c` within cEscapedCharacters, or std::nullopt
// when `c` is not one of the escapable characters.
std::optional<size_t>
isEscapedCharacter(char c)
{
    const std::string::size_type position = cEscapedCharacters.find(c);

    if (position == std::string::npos)
    {
        return std::nullopt;
    }

    return position;
}

// Inverse of isEscapedCharacter(): maps an index back to its character.
// The index is not range-checked here.
char
escapedCharacter(size_t index)
{
    return cEscapedCharacters[index];
}
45
46 std::string
47 encodeString(const std::string& src, int encodingFlags)
48 {
49 bool amps = (encodingFlags & cAmps) != 0,
50 doubleAmps = (encodingFlags & cDoubleAmps) != 0,
51 angleBrackets = (encodingFlags & cAngles) != 0,
52 quotes = (encodingFlags & cQuotes) != 0;
53
54 std::string tgt;
55
56 for (std::string::const_iterator i = src.begin(), ie = src.end(); i != ie; ++i)
57 {
58 if (*i == '&' && amps)
59 {
60 static const boost::regex cIgnore(
61 "^(&amp;)|(&#[0-9]{1,3};)|(&#[xX][0-9a-fA-F]{1,2};)");
62
63 if (boost::regex_search(i, ie, cIgnore))
64 {
65 tgt.push_back(*i);
66 }
67 else
68 {
69 tgt += "&amp;";
70 }
71 }
72 else if (*i == '&' && doubleAmps)
73 {
74 tgt += "&amp;";
75 }
76 else if (*i == '<' && angleBrackets)
77 {
78 tgt += "&lt;";
79 }
80 else if (*i == '>' && angleBrackets)
81 {
82 tgt += "&gt;";
83 }
84 else if (*i == '\"' && quotes)
85 {
86 tgt += "&quot;";
87 }
88 else
89 {
90 tgt.push_back(*i);
91 }
92 }
93
94 return tgt;
95 }
96
// Returns true when `str` starts with one of the URL prefixes recognised
// for auto-links (a scheme such as "http://", or a bare "www." / "ftp.").
bool
looksLikeUrl(const std::string& str)
{
    const char* const prefixes[] = {
        "http://", "https://", "ftp://", "ftps://", "file://", "www.", "ftp."};

    for (const char* candidate : prefixes)
    {
        const std::string prefix(candidate);

        // compare() clamps the length to what `str` actually holds, so a
        // string shorter than the prefix simply compares unequal.
        if (str.compare(0, prefix.size(), prefix) == 0)
        {
            return true;
        }
    }

    return false;
}
121
// Predicate for std::find_if: true when `c` may NOT appear in the name
// (local) part of an e-mail address. Allowed: alphanumerics and "._%-+".
bool
notValidNameCharacter(char c)
{
    // The <cctype> classifiers are undefined for negative arguments (other
    // than EOF), and plain char may be signed; go through unsigned char.
    const unsigned char uc = static_cast<unsigned char>(c);

    return !(isalnum(uc) || c == '.' || c == '_' || c == '%' || c == '-' || c == '+');
}
127
// Predicate for std::find_if: true when `c` may NOT appear in the site
// (domain) part of an e-mail address.
bool
notValidSiteCharacter(char c)
{
    // NOTE: Kludge alert! The official spec for site characters is only
    // "a-zA-Z._%-". However, MDTest supports "international domain names,"
    // which use characters other than that; I'm kind of cheating here, handling
    // those by allowing all utf8-encoded characters too.

    // The <cctype> classifiers are undefined for negative arguments (other
    // than EOF), and plain char may be signed; go through unsigned char.
    const unsigned char uc = static_cast<unsigned char>(c);

    return !(isalnum(uc) || c == '.' || c == '_' || c == '%' || c == '-' || (uc & 0x80));
}
137
// Predicate for std::find_if: true when `c` is not an alphabetic character.
bool
isNotAlpha(char c)
{
    // isalpha() is undefined for negative arguments (other than EOF), and
    // plain char may be signed; go through unsigned char.
    return !isalpha(static_cast<unsigned char>(c));
}
143
// Obfuscates `src` as a run of HTML numeric character references. Bytes at
// even positions are written in decimal ("&#97;"), bytes at odd positions in
// hexadecimal ("&#x62;"). Bytes with the high bit set are copied through
// untouched, but still count as a position.
std::string
emailEncode(const std::string& src)
{
    std::ostringstream encoded;

    for (std::string::size_type position = 0; position < src.size(); ++position)
    {
        const char ch = src[position];
        const bool useHex = (position % 2) != 0;

        if (ch & 0x80)
        {
            encoded << ch;
        }
        else if (useHex)
        {
            encoded << "&#x" << std::hex << static_cast<int>(ch) << ';';
        }
        else
        {
            encoded << "&#" << std::dec << static_cast<int>(ch) << ';';
        }
    }

    return encoded.str();
}
170
171 bool
172 looksLikeEmailAddress(const std::string& str)
173 {
174 using Iter = std::string::const_iterator;
175 using RIter = std::string::const_reverse_iterator;
176 Iter i = std::find_if(str.begin(), str.end(), notValidNameCharacter);
177
178 if (i != str.end() && *i == '@' && i != str.begin())
179 {
180 // The name part is valid.
181 i = std::find_if(i + 1, str.end(), notValidSiteCharacter);
182
183 if (i == str.end())
184 {
185 // The site part doesn't contain any invalid characters.
186 RIter ri = std::find_if(str.rbegin(), str.rend(), isNotAlpha);
187
188 if (ri != str.rend() && *ri == '.')
189 {
190 // It ends with a dot and only alphabetic characters.
191 size_t d = std::distance(ri.base(), str.end());
192
193 if (d >= 2 && d <= 4)
194 {
195 // There are two-to-four of them. It's valid.
196 return true;
197 }
198 }
199 }
200 }
201
202 return false;
203 }
204
205 // From <http://en.wikipedia.org/wiki/HTML_element>
206
// Null-terminated list of the non-block HTML tag names recognised by
// isValidTag() (reported there as category 1). A trailing '/' marks a tag
// that can have a closing tag; initTag() strips that marker before storing
// the name. "object/" is listed under both headings; the duplicate is
// harmless because the names end up in a std::set.
207 const char* cOtherTagInit[] = {
208 // Header tags
209 "title/",
210 "base",
211 "link",
212 "basefont",
213 "script/",
214 "style/",
215 "object/",
216 "meta",
217
218 // Inline tags
219 "em/",
220 "strong/",
221 "q/",
222 "cite/",
223 "dfn/",
224 "abbr/",
225 "acronym/",
226 "code/",
227 "samp/",
228 "kbd/",
229 "var/",
230 "sub/",
231 "sup/",
232 "del/",
233 "ins/",
234 "isindex",
235 "a/",
236 "img",
237 "br",
238 "map/",
239 "area",
240 "object/",
241 "param",
242 "applet/",
243 "span/",
244
245 nullptr};
246
// Null-terminated list of the block-level HTML tag names recognised by
// isValidTag() (reported there as category 2). As in cOtherTagInit, a
// trailing '/' marks a tag that can have a closing tag. "ins/" and "del/"
// also appear in cOtherTagInit, so which category isValidTag() reports for
// them depends on its nonBlockFirst argument.
247 const char* cBlockTagInit[] = {"p/", "blockquote/",
248 "hr", "h1/",
249 "h2/", "h3/",
250 "h4/", "h5/",
251 "h6/", "dl/",
252 "dt/", "dd/",
253 "ol/", "ul/",
254 "li/", "dir/",
255 "menu/", "table/",
256 "tr/", "th/",
257 "td/", "col",
258 "colgroup/", "caption/",
259 "thead/", "tbody/",
260 "tfoot/", "form/",
261 "select/", "option",
262 "input", "label/",
263 "textarea/", "div/",
264 "pre/", "address/",
265 "iframe/", "frame/",
266 "frameset/", "noframes/",
267 "center/", "b/",
268 "i/", "big/",
269 "small/", /*"s/",*/ "strike/",
270 "tt/", "u/",
271 "font/", "ins/",
272 "del/", nullptr};
273
274 // Other official ones (not presently in use in this code)
275 //"!doctype", "bdo", "body", "button", "fieldset", "head", "html",
276 //"legend", "noscript", "optgroup", "xmp",
277
// Lookup sets of recognised tag names, filled through initTag().
std::set<std::string> otherTags, blockTags;

// Inserts every name from the null-terminated array `init` into `set`.
// A trailing '/' (the "can have a closing tag" marker) is removed first.
// An empty entry is inserted as-is instead of reading before the start of
// the string.
void
initTag(std::set<std::string>& set, const char* init[])
{
    for (size_t x = 0; init[x] != nullptr; ++x)
    {
        std::string name = init[x];

        if (!name.empty() && name.back() == '/')
        {
            // Means it can have a closing tag
            name.pop_back();
        }

        set.insert(name);
    }
}
296
// Returns `ref` with every run of consecutive space characters collapsed
// to a single space. Only ' ' is treated as a space; nothing is trimmed.
std::string
cleanTextLinkRef(const std::string& ref)
{
    std::string cleaned;
    bool previousWasSpace = false;

    for (const char ch : ref)
    {
        const bool isSpace = (ch == ' ');

        // Drop a space only when it directly follows another space.
        if (!(isSpace && previousWasSpace))
        {
            cleaned.push_back(ch);
        }

        previousWasSpace = isSpace;
    }

    return cleaned;
}
319
320 } // namespace
321
322 size_t
323 isValidTag(const std::string& tag, bool nonBlockFirst)
324 {
325 if (blockTags.empty())
326 {
327 initTag(otherTags, cOtherTagInit);
328 initTag(blockTags, cBlockTagInit);
329 }
330
331 if (nonBlockFirst)
332 {
333 if (otherTags.find(tag) != otherTags.end())
334 {
335 return 1;
336 }
337
338 if (blockTags.find(tag) != blockTags.end())
339 {
340 return 2;
341 }
342 }
343 else
344 {
345 if (blockTags.find(tag) != blockTags.end())
346 {
347 return 2;
348 }
349
350 if (otherTags.find(tag) != otherTags.end())
351 {
352 return 1;
353 }
354 }
355
356 return 0;
357 }
358
359 void
360 TextHolder::writeAsHtml(std::ostream& out) const
361 {
362 preWrite(out);
363
364 if (mEncodingFlags != 0)
365 {
366 out << encodeString(mText, mEncodingFlags);
367 }
368 else
369 {
370 out << mText;
371 }
372
373 postWrite(out);
374 }
375
// Converts this token's text into a group of span-level tokens.
// Returns std::nullopt when canContainMarkup() is false. Otherwise the text
// goes through the passes below in order; each pass swaps what it recognises
// for placeholders, and the final pass returns the resulting token group.
// NOTE(review): the line carrying the qualified function name and parameter
// list is missing from this listing (the numbering jumps from 376 to 378);
// the body uses a parameter named `idTable` -- confirm against the original.
376 std::optional<TokenGroup>
378 {
379 if (!canContainMarkup())
380 {
381 return std::nullopt;
382 }
383
384 ReplacementTable replacements;
// Attribute strings are protected first so that later passes (code spans in
// particular) never see text inside an HTML tag's quoted attributes.
385 std::string str = _processHtmlTagAttributes(*text(), replacements);
386 str = _processCodeSpans(str, replacements);
387 str = _processEscapedCharacters(str);
388 str = _processLinksImagesAndTags(str, replacements, idTable);
389 return _processBoldAndItalicSpans(str, replacements);
390 }
391
392 std::string
393 RawText::_processHtmlTagAttributes(std::string src, ReplacementTable& replacements)
394 {
395 // Because "Attribute Content Is Not A Code Span"
396 std::string tgt;
397 std::string::const_iterator prev = src.begin(), end = src.end();
398
399 while (true)
400 {
401 static const boost::regex cHtmlToken(
402 "<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))+? */? *))>");
403 boost::smatch m;
404
405 if (boost::regex_search(prev, end, m, cHtmlToken))
406 {
407 // NOTE: Kludge alert! The `isValidTag` test is a cheat, only here
408 // to handle some edge cases between the Markdown test suite and the
409 // PHP-Markdown one, which seem to conflict.
410 if (isValidTag(m[3]))
411 {
412 tgt += std::string(prev, m[0].first);
413
414 std::string fulltag = m[0], tgttag;
415 std::string::const_iterator prevtag = fulltag.begin(), endtag = fulltag.end();
416
417 while (true)
418 {
419 static const boost::regex cAttributeStrings("= ?(\"|').*?\\1");
420 boost::smatch mtag;
421
422 if (boost::regex_search(prevtag, endtag, mtag, cAttributeStrings))
423 {
424 tgttag += std::string(prevtag, mtag[0].first);
425 tgttag +=
426 "\x01@" + std::to_string(replacements.size()) + "@htmlTagAttr\x01";
427 prevtag = mtag[0].second;
428
429 replacements.push_back(TokenPtr(
430 new TextHolder(std::string(mtag[0]), false, cAmps | cAngles)));
431 }
432 else
433 {
434 tgttag += std::string(prevtag, endtag);
435 break;
436 }
437 }
438
439 tgt += tgttag;
440 prev = m[0].second;
441 }
442 else
443 {
444 tgt += std::string(prev, m[0].second);
445 prev = m[0].second;
446 }
447 }
448 else
449 {
450 tgt += std::string(prev, end);
451 break;
452 }
453 }
454
455 return tgt;
456 }
457
458 std::string
459 RawText::_processCodeSpans(std::string src, ReplacementTable& replacements)
460 {
461 static const boost::regex cCodeSpan[2] = {boost::regex("(?:^|(?<=[^\\\\]))`` (.+?) ``"),
462 boost::regex("(?:^|(?<=[^\\\\]))`(.+?)`")};
463
464 for (const auto& pass : cCodeSpan)
465 {
466 std::string tgt;
467 std::string::const_iterator prev = src.begin(), end = src.end();
468
469 while (true)
470 {
471 boost::smatch m;
472
473 if (boost::regex_search(prev, end, m, pass))
474 {
475 tgt += std::string(prev, m[0].first);
476 tgt += "\x01@" + std::to_string(replacements.size()) + "@codeSpan\x01";
477 prev = m[0].second;
478 replacements.push_back(
479 TokenPtr(new CodeSpan(_restoreProcessedItems(m[1], replacements))));
480 }
481 else
482 {
483 tgt += std::string(prev, end);
484 break;
485 }
486 }
487
488 src.swap(tgt);
489 tgt.clear();
490 }
491
492 return src;
493 }
494
495 std::string
496 RawText::_processEscapedCharacters(const std::string& src)
497 {
498 std::string tgt;
499 std::string::const_iterator prev = src.begin(), end = src.end();
500
501 while (true)
502 {
503 std::string::const_iterator i = std::find(prev, end, '\\');
504
505 if (i != end)
506 {
507 tgt += std::string(prev, i);
508 ++i;
509
510 if (i != end)
511 {
512 std::optional<size_t> e = isEscapedCharacter(*i);
513
514 if (e)
515 {
516 tgt += "\x01@#" + std::to_string(*e) + "@escaped\x01";
517 }
518 else
519 {
520 tgt = tgt + '\\' + *i;
521 }
522
523 prev = i + 1;
524 }
525 else
526 {
527 tgt += '\\';
528 break;
529 }
530 }
531 else
532 {
533 tgt += std::string(prev, end);
534 break;
535 }
536 }
537
538 return tgt;
539 }
540
541 std::string
542 RawText::_processSpaceBracketedGroupings(const std::string& src, ReplacementTable& replacements)
543 {
544 static const boost::regex cRemove("(?:(?: \\*+ )|(?: _+ ))");
545
546 std::string tgt;
547 std::string::const_iterator prev = src.begin(), end = src.end();
548
549 while (true)
550 {
551 boost::smatch m;
552
553 if (boost::regex_search(prev, end, m, cRemove))
554 {
555 tgt += std::string(prev, m[0].first);
556 tgt += "\x01@" + std::to_string(replacements.size()) + "@spaceBracketed\x01";
557 replacements.push_back(TokenPtr(new RawText(m[0])));
558 prev = m[0].second;
559 }
560 else
561 {
562 tgt += std::string(prev, end);
563 break;
564 }
565 }
566
567 return tgt;
568 }
569
// Scans `src` for inline links/images, reference links/images and
// "<...>" constructs (HTML tags or auto-links). Each match is replaced in
// the returned string by a "\x01@N@links&Images1\x01" placeholder, where N
// is the index of the token appended to `replacements` for it. Reference
// links are resolved through `idTable`.
570 std::string
571 RawText::_processLinksImagesAndTags(const std::string& src,
572 ReplacementTable& replacements,
573 const LinkIds& idTable)
574 {
575 // NOTE: Kludge alert! The "inline link or image" regex should be...
576 //
577 // "(?:(!?)\\[(.+?)\\] *\\((.*?)\\))"
578 //
579 // ...but that fails on the 'Images' test because it includes a "stupid URL"
580 // that has parentheses within it. The proper way to deal with this would be
581 // to match any nested parentheses, but regular expressions can't handle an
582 // unknown number of nested items, so I'm cheating -- the regex for it
583 // allows for one (and *only* one) pair of matched parentheses within the
584 // URL. It makes the regex hard to follow (it was even harder to get right),
585 // but it allows it to pass the test.
586 //
587 // The "reference link or image" one has a similar problem; it should be...
588 //
589 // "|(?:(!?)\\[(.+?)\\](?: *\\[(.*?)\\])?)"
590 //
591 static const boost::regex cExpression(
592 "(?:(!?)\\[([^\\]]+?)\\] *\\(([^\\(]*(?:\\(.*?\\).*?)*?)\\))" // Inline link or image
593 "|(?:(!?)\\[((?:[^]]*?\\[.*?\\].*?)|(?:.+?))\\](?: *\\[(.*?)\\])?)" // Reference link or image
594 "|(?:<(/?([a-zA-Z0-9]+).*?)>)" // potential HTML tag or auto-link
595 );
596 // Important captures: 1/4=image indicator, 2/5=contents/alttext,
597 // 3=URL/title, 6=optional link ID, 7=potential HTML tag or auto-link
598 // contents, 8=actual tag from 7.
599
600 std::string tgt;
601 std::string::const_iterator prev = src.begin(), end = src.end();
602
603 while (true)
604 {
605 boost::smatch m;
606
607 if (boost::regex_search(prev, end, m, cExpression))
608 {
609 assert(m[0].matched);
610 assert(m[0].length() != 0);
611
// Emit the placeholder now; every branch below pushes exactly one token at
// that index (the link branch additionally emits a second placeholder).
612 tgt += std::string(prev, m[0].first);
613 tgt += "\x01@" + std::to_string(replacements.size()) + "@links&Images1\x01";
614 prev = m[0].second;
615
// Work out which alternative of cExpression matched.
616 bool isImage = false, isLink = false, isReference = false;
617
618 if (m[4].matched && m[4].length())
619 {
620 isImage = isReference = true;
621 }
622 else if (m[1].matched && m[1].length())
623 {
624 isImage = true;
625 }
626 else if (m[5].matched)
627 {
628 isLink = isReference = true;
629 }
630 else if (m[2].matched)
631 {
632 isLink = true;
633 }
634
635 if (isImage || isLink)
636 {
637 std::string contentsOrAlttext, url, title;
638 bool resolved = false;
639
640 if (isReference)
641 {
642 contentsOrAlttext = m[5];
643 std::string linkId = (m[6].matched ? std::string(m[6]) : std::string());
644
// With no explicit ID, the link text itself (spaces collapsed) is the ID.
645 if (linkId.empty())
646 {
647 linkId = cleanTextLinkRef(contentsOrAlttext);
648 }
649
650 std::optional<markdown::LinkIds::Target> target = idTable.find(linkId);
651
652 if (target)
653 {
654 url = target->url;
655 title = target->title;
656 resolved = true;
657 };
658 }
659 else
660 {
661 static const boost::regex cReference(
662 "^<?([^ >]*)>?(?: *(?:('|\")(.*)\\2)|(?:\\((.*)\\)))? *$");
663 // Useful captures: 1=url, 3/4=title
664 contentsOrAlttext = m[2];
665 std::string urlAndTitle = m[3];
666 boost::smatch mm;
667
668 if (boost::regex_match(urlAndTitle, mm, cReference))
669 {
670 url = mm[1];
671
672 if (mm[3].matched)
673 {
674 title = mm[3];
675 }
676 else if (mm[4].matched)
677 {
678 title = mm[4];
679 }
680
681 resolved = true;
682 }
683 }
684
685 if (!resolved)
686 {
687 // Just encode the first character as-is, and continue
688 // searching after it.
689 prev = m[0].first + 1;
690 replacements.push_back(
691 TokenPtr(new RawText(std::string(m[0].first, prev))));
692 }
693 else if (isImage)
694 {
695 replacements.push_back(TokenPtr(new Image(contentsOrAlttext, url, title)));
696 }
697 else
698 {
// A link becomes: <a ...> placeholder, the link text (left in the string so
// later passes still process it), then a second placeholder for </a>.
699 replacements.push_back(TokenPtr(new HtmlAnchorTag(url, title)));
700 tgt += contentsOrAlttext;
701 tgt += "\x01@" + std::to_string(replacements.size()) + "@links&Images2\x01";
702 replacements.push_back(TokenPtr(new HtmlTag("/a")));
703 }
704 }
705 else
706 {
707 // Otherwise it's an HTML tag or auto-link.
708 std::string contents = m[7];
709
710 // cerr << "Evaluating potential HTML or auto-link: " << contents << endl;
711 // cerr << "m[8]=" << m[8] << endl;
712
713 if (looksLikeUrl(contents))
714 {
715 TokenGroup subgroup;
716 subgroup.push_back(TokenPtr(new HtmlAnchorTag(contents)));
717 subgroup.push_back(TokenPtr(new RawText(contents, false)));
718 subgroup.push_back(TokenPtr(new HtmlTag("/a")));
719 replacements.push_back(TokenPtr(new Container(subgroup)));
720 }
721 else if (looksLikeEmailAddress(contents))
722 {
// Both the mailto: target and the visible text are entity-obfuscated.
723 TokenGroup subgroup;
724 subgroup.push_back(
725 TokenPtr(new HtmlAnchorTag(emailEncode("mailto:" + contents))));
726 subgroup.push_back(TokenPtr(new RawText(emailEncode(contents), false)));
727 subgroup.push_back(TokenPtr(new HtmlTag("/a")));
728 replacements.push_back(TokenPtr(new Container(subgroup)));
729 }
730 else if (isValidTag(m[8]))
731 {
732 replacements.push_back(
733 TokenPtr(new HtmlTag(_restoreProcessedItems(contents, replacements))));
734 }
735 else
736 {
737 // Just encode it as-is
738 replacements.push_back(TokenPtr(new RawText(m[0])));
739 }
740 }
741 }
742 else
743 {
744 tgt += std::string(prev, end);
745 break;
746 }
747 }
748
749 return tgt;
750 }
751
// Splits `src` into RawText and BoldOrItalicMarker tokens, pairs open and
// close markers, un-pairs matches that are not properly nested, and finally
// expands the placeholders in the remaining text through
// _encodeProcessedItems(). Returns the resulting token group.
// NOTE(review): the return-type line is missing from this listing (the
// numbering jumps from 751 to 753); the function returns the local
// `TokenGroup r` -- confirm against the original file.
753 RawText::_processBoldAndItalicSpans(const std::string& src, ReplacementTable& replacements)
754 {
755 static const boost::regex cEmphasisExpression(
756 "(?:(?<![*_])([*_]{1,3})([^*_ ]+?)\\1(?![*_]))" // Mid-word emphasis
757 "|((?:(?<!\\*)\\*{1,3}(?!\\*)|(?<!_)_{1,3}(?!_))(?=.)(?! "
758 ")(?![.,:;] )(?![.,:;]$))" // Open
759 "|((?<![* ])\\*{1,3}(?!\\*)|(?<![ _])_{1,3}(?!_))" // Close
760 );
761
762 TokenGroup tgt;
763 std::string::const_iterator i = src.begin(), end = src.end(), prev = i;
764
// Pass 1: tokenize. Captures: 1/2 = mid-word marker and its contents,
// 3 = open marker, 4 = close marker.
765 while (true)
766 {
767 boost::smatch m;
768
769 if (boost::regex_search(prev, end, m, cEmphasisExpression))
770 {
771 if (prev != m[0].first)
772 tgt.push_back(TokenPtr(new RawText(std::string(prev, m[0].first))));
773
774 if (m[3].matched)
775 {
776 std::string token = m[3];
777 tgt.push_back(TokenPtr(new BoldOrItalicMarker(true, token[0], token.length())));
778 prev = m[0].second;
779 }
780 else if (m[4].matched)
781 {
782 std::string token = m[4];
783 tgt.push_back(
784 TokenPtr(new BoldOrItalicMarker(false, token[0], token.length())));
785 prev = m[0].second;
786 }
787 else
788 {
789 std::string token = m[1], contents = m[2];
790 tgt.push_back(TokenPtr(new BoldOrItalicMarker(true, token[0], token.length())));
791 tgt.push_back(TokenPtr(new RawText(std::string(contents))));
792 tgt.push_back(
793 TokenPtr(new BoldOrItalicMarker(false, token[0], token.length())));
794 prev = m[0].second;
795 }
796 }
797 else
798 {
799 if (prev != end)
800 tgt.push_back(TokenPtr(new RawText(std::string(prev, end))));
801
802 break;
803 }
804 }
805
// Pass 2: pair each unmatched open marker with a later unmatched close
// marker of the same character and size. `id` numbers the pairs.
806 int id = 0;
807
808 for (TokenGroup::iterator ii = tgt.begin(), iie = tgt.end(); ii != iie; ++ii)
809 {
810 if ((*ii)->isUnmatchedOpenMarker())
811 {
812 BoldOrItalicMarker* openToken = dynamic_cast<BoldOrItalicMarker*>(ii->get());
813
814 // Find a matching close-marker, if it's there
815 TokenGroup::iterator iii = ii;
816
817 for (++iii; iii != iie; ++iii)
818 {
819 if ((*iii)->isUnmatchedCloseMarker())
820 {
821 BoldOrItalicMarker* closeToken =
822 dynamic_cast<BoldOrItalicMarker*>(iii->get());
823
824 if (closeToken->size() == 3 && openToken->size() != 3)
825 {
826 // Split the close-token into a match for the open-token
827 // and a second for the leftovers.
828 closeToken->disable();
829 TokenGroup g;
830 g.push_back(TokenPtr(
831 new BoldOrItalicMarker(false,
832 closeToken->tokenCharacter(),
833 closeToken->size() - openToken->size())));
834 g.push_back(TokenPtr(new BoldOrItalicMarker(
835 false, closeToken->tokenCharacter(), openToken->size())));
// The two new markers are spliced in right after the disabled one, so the
// inner loop visits them next.
836 TokenGroup::iterator after = iii;
837 ++after;
838 tgt.splice(after, g);
839 continue;
840 }
841
842 if (closeToken->tokenCharacter() == openToken->tokenCharacter() &&
843 closeToken->size() == openToken->size())
844 {
845 openToken->matched(closeToken, id);
846 closeToken->matched(openToken, id);
847 ++id;
848 break;
849 }
850 else if (openToken->size() == 3)
851 {
852 // Split the open-token into a match for the close-token
853 // and a second for the leftovers.
854 openToken->disable();
855 TokenGroup g;
856 g.push_back(TokenPtr(
857 new BoldOrItalicMarker(true,
858 openToken->tokenCharacter(),
859 openToken->size() - closeToken->size())));
860 g.push_back(TokenPtr(new BoldOrItalicMarker(
861 true, openToken->tokenCharacter(), closeToken->size())));
// Spliced in right after the disabled open marker, so the outer loop
// reaches the two new open markers on its following iterations.
862 TokenGroup::iterator after = ii;
863 ++after;
864 tgt.splice(after, g);
865 break;
866 }
867 }
868 }
869 }
870 }
871
872 // "Unmatch" invalidly-nested matches.
873 std::stack<BoldOrItalicMarker*> openMatches;
874
875 for (auto& ii : tgt)
876 {
877 if (ii->isMatchedOpenMarker())
878 {
879 BoldOrItalicMarker* open = dynamic_cast<BoldOrItalicMarker*>(ii.get());
880 openMatches.push(open);
881 }
882 else if (ii->isMatchedCloseMarker())
883 {
884 BoldOrItalicMarker* close = dynamic_cast<BoldOrItalicMarker*>(ii.get());
885
// A close marker that does not belong to the innermost open marker is
// crossing another pair: un-match both ends of it.
886 if (close->id() != openMatches.top()->id())
887 {
888 close->matchedTo()->matched(nullptr);
889 close->matched(nullptr);
890 }
891 else
892 {
893 openMatches.pop();
894
// Also discard open markers that were un-matched in the meantime.
895 while (!openMatches.empty() && openMatches.top()->matchedTo() == nullptr)
896 {
897 openMatches.pop();
898 }
899 }
900 }
901 }
902
// Final pass: expand placeholders inside text tokens; other tokens are
// carried over unchanged.
903 TokenGroup r;
904
905 for (auto& ii : tgt)
906 {
907 if (ii->text() && ii->canContainMarkup())
908 {
909 TokenGroup t = _encodeProcessedItems(*ii->text(), replacements);
910 r.splice(r.end(), t);
911 }
912 else
913 {
914 r.push_back(ii);
915 }
916 }
917
918 return r;
919 }
920
// Turns `src` into a token group: text between placeholders becomes
// RawText tokens, "\x01@#N@...\x01" placeholders become EscapedCharacter
// tokens, and "\x01@N@...\x01" placeholders are replaced by replacements[N].
// A placeholder with no number is dropped.
// NOTE(review): the return-type line is missing from this listing (the
// numbering jumps from 920 to 922); the function returns the local
// `TokenGroup r` -- confirm against the original file.
922 RawText::_encodeProcessedItems(const std::string& src, ReplacementTable& replacements)
923 {
924 static const boost::regex cReplaced("\x01@(#?[0-9]*)@.+?\x01");
925
926 TokenGroup r;
927 std::string::const_iterator prev = src.begin();
928
929 while (true)
930 {
931 boost::smatch m;
932
933 if (boost::regex_search(prev, src.end(), m, cReplaced))
934 {
935 std::string pre = std::string(prev, m[0].first);
936
937 if (!pre.empty())
938 {
939 r.push_back(TokenPtr(new RawText(pre)));
940 }
941
942 prev = m[0].second;
943
944 std::string ref = m[1];
945
// A leading '#' marks an escaped-character placeholder; the number is an
// index for escapedCharacter(), which does not range-check it.
946 if (ref[0] == '#')
947 {
948 size_t n = std::stoul(ref.substr(1));
949 r.push_back(TokenPtr(new EscapedCharacter(escapedCharacter(n))));
950 }
951 else if (!ref.empty())
952 {
953 size_t n = std::stoul(ref);
954
955 assert(n < replacements.size());
956 r.push_back(replacements[n]);
957 } // Otherwise just eat it
958 }
959 else
960 {
961 std::string pre = std::string(prev, src.end());
962
963 if (!pre.empty())
964 {
965 r.push_back(TokenPtr(new RawText(pre)));
966 }
967
968 break;
969 }
970 }
971
972 return r;
973 }
974
975 std::string
976 RawText::_restoreProcessedItems(const std::string& src, ReplacementTable& replacements)
977 {
978 static const boost::regex cReplaced("\x01@(#?[0-9]*)@.+?\x01");
979
980 std::ostringstream r;
981 std::string::const_iterator prev = src.begin();
982
983 while (true)
984 {
985 boost::smatch m;
986
987 if (boost::regex_search(prev, src.end(), m, cReplaced))
988 {
989 std::string pre = std::string(prev, m[0].first);
990
991 if (!pre.empty())
992 {
993 r << pre;
994 }
995
996 prev = m[0].second;
997
998 std::string ref = m[1];
999
1000 if (ref[0] == '#')
1001 {
1002 size_t n = std::stoul(ref.substr(1));
1003 r << '\\' << escapedCharacter(n);
1004 }
1005 else if (!ref.empty())
1006 {
1007 size_t n = std::stoul(ref);
1008
1009 assert(n < replacements.size());
1010 replacements[n]->writeAsOriginal(r);
1011 } // Otherwise just eat it
1012 }
1013 else
1014 {
1015 std::string pre = std::string(prev, src.end());
1016
1017 if (!pre.empty())
1018 {
1019 r << pre;
1020 }
1021
1022 break;
1023 }
1024 }
1025
1026 return r.str();
1027 }
1028
1029 HtmlAnchorTag::HtmlAnchorTag(const std::string& url, const std::string& title) :
1030 TextHolder("<a href=\"" + encodeString(url, cQuotes | cAmps) + "\"" +
1031 (title.empty() ? std::string()
1032 : " title=\"" + encodeString(title, cQuotes | cAmps) + "\"") +
1033 ">",
1034 false,
1035 0)
1036 {
1037 // This space deliberately blank. ;-)
1038 }
1039
// Writes the code block wrapped in "<pre><code>" ... "</code></pre>",
// followed by a blank line.
// NOTE(review): the statement that writes the block's content between the
// two tags is missing from this listing (the numbering jumps from 1043 to
// 1045) -- confirm against the original file.
1040 void
1041 CodeBlock::writeAsHtml(std::ostream& out) const
1042 {
1043 out << "<pre><code>";
1045 out << "</code></pre>\n\n";
1046 }
1047
// Writes the code span wrapped in "<code>" ... "</code>".
// NOTE(review): the statement that writes the span's content between the
// two tags is missing from this listing (the numbering jumps from 1051 to
// 1053) -- confirm against the original file.
1048 void
1049 CodeSpan::writeAsHtml(std::ostream& out) const
1050 {
1051 out << "<code>";
1053 out << "</code>";
1054 }
1055
1056 void
1057 CodeSpan::writeAsOriginal(std::ostream& out) const
1058 {
1059 out << '`' << *text() << '`';
1060 }
1061
1062 void
1063 Container::writeAsHtml(std::ostream& out) const
1064 {
1065 preWrite(out);
1066
1067 for (const auto& mSubToken : mSubTokens)
1068 {
1069 mSubToken->writeAsHtml(out);
1070 }
1071
1072 postWrite(out);
1073 }
1074
1075 void
1076 Container::writeToken(size_t indent, std::ostream& out) const
1077 {
1078 out << std::string(indent * 2, ' ') << containerName() << "\n";
1079
1080 for (const auto& mSubToken : mSubTokens)
1081 {
1082 mSubToken->writeToken(indent + 1, out);
1083 }
1084 }
1085
// Runs span processing over every sub-token and replaces this container's
// sub-tokens with the results. Always returns std::nullopt: the container
// is updated in place through swapSubtokens().
// NOTE(review): the line carrying the qualified function name and parameter
// list is missing from this listing (the numbering jumps from 1086 to 1088);
// the body uses a parameter named `idTable` -- confirm against the original.
1086 std::optional<TokenGroup>
1088 {
1089 TokenGroup t;
1090
1091 for (CTokenGroupIter ii = mSubTokens.begin(), iie = mSubTokens.end(); ii != iie; ++ii)
1092 {
// Sub-tokens that carry text: a multi-token result is wrapped in a new
// Container, a single-token result replaces the sub-token directly, and an
// empty result drops it.
1093 if ((*ii)->text())
1094 {
1095 std::optional<TokenGroup> subt = (*ii)->processSpanElements(idTable);
1096
1097 if (subt)
1098 {
1099 if (subt->size() > 1)
1100 {
1101 t.push_back(TokenPtr(new Container(*subt)));
1102 }
1103 else if (!subt->empty())
1104 {
1105 t.push_back(*subt->begin());
1106 }
1107 }
1108 else
1109 {
1110 t.push_back(*ii);
1111 }
1112 }
1113 else
1114 {
// Sub-tokens without text: when one returns a group, it is expected to be a
// Container (asserted) and is cloned around the returned group.
1115 std::optional<TokenGroup> subt = (*ii)->processSpanElements(idTable);
1116
1117 if (subt)
1118 {
1119 const Container* c = dynamic_cast<const Container*>((*ii).get());
1120 assert(c != 0);
1121 t.push_back(c->clone(*subt));
1122 }
1123 else
1124 {
1125 t.push_back(*ii);
1126 }
1127 }
1128 }
1129
1130 swapSubtokens(t);
1131 return std::nullopt;
1132 }
1133
1134 UnorderedList::UnorderedList(const TokenGroup& contents, bool paragraphMode)
1135 {
1136 if (paragraphMode)
1137 {
1138 // Change each of the text items into paragraphs
1139 for (const auto& content : contents)
1140 {
1141 token::ListItem* item = dynamic_cast<token::ListItem*>(content.get());
1142 assert(item != 0);
1143 item->inhibitParagraphs(false);
1144 mSubTokens.push_back(content);
1145 }
1146 }
1147 else
1148 {
1149 mSubTokens = contents;
1150 }
1151 }
1152
1153 void
1154 BoldOrItalicMarker::writeAsHtml(std::ostream& out) const
1155 {
1156 if (!mDisabled)
1157 {
1158 if (mMatch != nullptr)
1159 {
1160 assert(mSize >= 1 && mSize <= 3);
1161
1162 if (mOpenMarker)
1163 {
1164 out << (mSize == 1 ? "<em>" : mSize == 2 ? "<strong>" : "<strong><em>");
1165 }
1166 else
1167 {
1168 out << (mSize == 1 ? "</em>" : mSize == 2 ? "</strong>" : "</em></strong>");
1169 }
1170 }
1171 else
1172 {
1173 out << std::string(mSize, mTokenCharacter);
1174 }
1175 }
1176 }
1177
1178 void
1179 BoldOrItalicMarker::writeToken(std::ostream& out) const
1180 {
1181 if (!mDisabled)
1182 {
1183 if (mMatch != nullptr)
1184 {
1185 std::string type = (mSize == 1 ? "italic" : mSize == 2 ? "bold" : "italic&bold");
1186
1187 if (mOpenMarker)
1188 {
1189 out << "Matched open-" << type << " marker\n";
1190 }
1191 else
1192 {
1193 out << "Matched close-" << type << " marker\n";
1194 }
1195 }
1196 else
1197 {
1198 if (mOpenMarker)
1199 out << "Unmatched bold/italic open marker: "
1200 << std::string(mSize, mTokenCharacter) << "\n";
1201 else
1202 out << "Unmatched bold/italic close marker: "
1203 << std::string(mSize, mTokenCharacter) << "\n";
1204 }
1205 }
1206 }
1207
1208 void
1209 Image::writeAsHtml(std::ostream& out) const
1210 {
1211 out << "<img src=\"" << mUrl << "\" alt=\"" << mAltText << "\"";
1212
1213 if (!mTitle.empty())
1214 {
1215 out << " title=\"" << mTitle << "\"";
1216 }
1217
1218 out << "/>";
1219 }
1220
1221} // namespace markdown::token
uint8_t index
if(!yyvaluep)
Definition Grammar.cpp:645
constexpr T c
std::string str(const T &t)
virtual void postWrite(std::ostream &out) const
virtual void preWrite(std::ostream &out) const
void writeAsHtml(std::ostream &out) const override
void writeToken(std::ostream &out) const override
void writeAsHtml(std::ostream &out) const override
void writeAsHtml(std::ostream &out) const override
void writeAsOriginal(std::ostream &out) const override
void writeAsHtml(std::ostream &out) const override
void swapSubtokens(TokenGroup &tokens)
std::optional< TokenGroup > processSpanElements(const LinkIds &idTable) override
void writeToken(std::ostream &out) const override
Container(const TokenGroup &contents=TokenGroup())
virtual std::string containerName() const
HtmlAnchorTag(const std::string &url, const std::string &title=std::string())
void writeAsHtml(std::ostream &out) const override
RawText(const std::string &text, bool canContainMarkup=true)
std::optional< TokenGroup > processSpanElements(const LinkIds &idTable) override
void writeAsHtml(std::ostream &out) const override
TextHolder(const std::string &text, bool canContainMarkup, unsigned int encodingFlags)
bool canContainMarkup() const override
std::optional< std::string > text() const override
UnorderedList(const TokenGroup &contents, bool paragraphMode=false)
This file offers overloads of toIce() and fromIce() functions for STL container types.
Vertex target(const detail::edge_base< Directed, Vertex > &e, const PCG &)
constexpr auto n() noexcept
size_t isValidTag(const std::string &tag, bool nonBlockFirst)
std::shared_ptr< Token > TokenPtr
Definition markdown.h:21
std::list< TokenPtr > TokenGroup
Definition markdown.h:22
TokenGroup::const_iterator CTokenGroupIter