10 #include <boost/regex.hpp>
// Characters that isEscapedCharacter() recognises. escapedCharacter() indexes
// into this same string, so the position of each character is significant.
22 const std::string cEscapedCharacters(
"\\`*_{}[]()#+-.!>");
// Looks `c` up in cEscapedCharacters using a linear std::find.
// NOTE(review): the return statements are not visible here; presumably the
// result holds the position of `c` in the table when it is found and is empty
// otherwise -- confirm.
24 std::optional<size_t> isEscapedCharacter(
char c)
26 std::string::const_iterator i = std::find(cEscapedCharacters.begin(),
27 cEscapedCharacters.end(),
c);
// Found: `i` refers to the matching entry of the table.
29 if (i != cEscapedCharacters.end())
39 char escapedCharacter(
size_t index)
41 return cEscapedCharacters[
index];
// Returns a string built from `src`, treating '&', '<', '>' and '"' specially
// according to the bits set in `encodingFlags` (cAmps, cAngles and cQuotes
// are tested below).
// NOTE(review): the statements executed inside each branch, and the
// definition of `doubleAmps`, are not visible here -- presumably each branch
// writes the corresponding HTML entity; confirm.
44 std::string encodeString(
const std::string& src,
int encodingFlags)
// Decode the flag bits once, before scanning.
46 bool amps = (encodingFlags &
cAmps) != 0,
48 angleBrackets = (encodingFlags &
cAngles) != 0,
49 quotes = (encodingFlags &
cQuotes) != 0;
// Examine the input one character at a time.
53 for (std::string::const_iterator i = src.begin(), ie = src.end(); i != ie; ++i)
55 if (*i ==
'&' && amps)
// Compiled once (function-local static) and searched over [i, ie).
// NOTE(review): only the first alternative is anchored by '^'; the two
// numeric-reference alternatives can match anywhere in the remaining text,
// and `^(&)` matches every '&' at the current position. Presumably the
// intent is "an existing entity starts at i" -- confirm the pattern.
57 static const boost::regex cIgnore(
"^(&)|(&#[0-9]{1,3};)|(&#[xX][0-9a-fA-F]{1,2};)");
59 if (boost::regex_search(i, ie, cIgnore))
68 else if (*i ==
'&' && doubleAmps)
72 else if (*i ==
'<' && angleBrackets)
76 else if (*i ==
'>' && angleBrackets)
80 else if (*i ==
'\"' && quotes)
// Tests `str` against a fixed list of URL-like prefixes.
// NOTE(review): the statements that advance the two pointers and produce the
// result are not visible here; presumably the function returns true when one
// of the prefixes is matched in full at the start of `str` -- confirm.
93 bool looksLikeUrl(
const std::string&
str)
// nullptr-terminated, so the loop below needs no element count.
95 const char* schemes[] = {
"http://",
"https://",
"ftp://",
"ftps://",
96 "file://",
"www.",
"ftp.",
nullptr
// Try each prefix in turn.
99 for (
size_t x = 0; schemes[x] !=
nullptr; ++x)
// `s` walks the input, `t` walks the current prefix, both from their start.
101 const char*
s =
str.c_str(), *t = schemes[x];
// Continues while neither string has ended and the characters agree.
103 while (*
s != 0 && *t != 0 && *
s == *t)
// Predicate for std::find_if: returns true when `c` is NOT accepted in the
// name part (the text before '@') of an e-mail address as recognised by
// looksLikeEmailAddress().
//
// Accepted characters are those isalnum() accepts, plus '.', '_', '%', '-'
// and '+'.
bool notValidNameCharacter(char c)
{
    // The <cctype> classification functions have undefined behaviour when
    // given a negative value other than EOF, and plain `char` may be signed
    // (e.g. bytes of UTF-8 text). Convert to unsigned char first.
    const unsigned char uc = static_cast<unsigned char>(c);

    return !(isalnum(uc) || c == '.' || c == '_' || c == '%' || c == '-' || c == '+');
}
// Predicate for std::find_if: returns true when `c` is NOT accepted in the
// site part (the text after '@') of an e-mail address as recognised by
// looksLikeEmailAddress().
//
// Accepted characters are those isalnum() accepts, plus '.', '_', '%', '-'
// and any byte with the high bit (0x80) set.
bool notValidSiteCharacter(char c)
{
    // High-bit bytes are expected here, and on platforms where plain `char`
    // is signed they are negative. Passing a negative value (other than EOF)
    // to isalnum() is undefined behaviour, so classify the unsigned value.
    const unsigned char uc = static_cast<unsigned char>(c);

    const bool accepted = isalnum(uc) || c == '.' || c == '_' || c == '%' || c == '-'
        || (uc & 0x80) != 0;
    return !accepted;
}
// Character predicate passed to std::find_if by looksLikeEmailAddress().
// NOTE(review): the body is not visible here; judging by the name it
// presumably reports characters that are not alphabetic -- confirm.
132 bool isNotAlpha(
char c)
// Builds an encoded form of `src` in a string stream. Two numeric character
// reference forms are written below: hexadecimal ("&#x..;") and decimal
// ("&#..;").
// NOTE(review): the loop over `src` and the rule that chooses between the
// forms are not visible here -- confirm before relying on the exact output.
137 std::string emailEncode(
const std::string& src)
139 std::ostringstream out;
// Hexadecimal numeric character reference for the integer value of `i`.
150 out <<
"&#x" << std::hex << static_cast<int>(i) <<
';';
// Decimal numeric character reference for the integer value of `i`.
154 out <<
"&#" << std::dec << static_cast<int>(i) <<
';';
// Heuristic test of whether `str` has the shape of an e-mail address: a name
// part, '@', then a site part ending in '.' followed by a short suffix.
// NOTE(review): several statements (including the definition of `d` and the
// return statements) are not visible here.
163 bool looksLikeEmailAddress(
const std::string&
str)
165 using Iter = std::string::const_iterator;
166 using RIter = std::string::const_reverse_iterator;
// First character that notValidNameCharacter() rejects.
167 Iter i = std::find_if(
str.begin(),
str.end(), notValidNameCharacter);
// That character must exist, be '@', and not be the very first character.
169 if (i !=
str.end() && *i ==
'@' && i !=
str.begin())
// Continue after the '@': first character notValidSiteCharacter() rejects.
172 i = std::find_if(i + 1,
str.end(), notValidSiteCharacter);
// Scan backwards from the end for the first character isNotAlpha() flags.
177 RIter ri = std::find_if(
str.rbegin(),
str.rend(), isNotAlpha);
// That character must exist and be a '.'.
179 if (ri !=
str.rend() && *ri ==
'.')
// NOTE(review): `d` is presumably the number of characters after that '.',
// required to be between 2 and 4 inclusive -- confirm.
184 if (d >= 2 && d <= 4)
// Tag names that isValidTag() loads into `otherTags` through initTag().
// Some entries end in '/', which initTag() tests for.
// NOTE(review): presumably the trailing '/' marks tags that also have a
// closing form; the start and end of this table are not visible here.
198 const char* cOtherTagInit[] =
201 "title/",
"base",
"link",
"basefont",
"script/",
"style/",
205 "em/",
"strong/",
"q/",
"cite/",
"dfn/",
"abbr/",
"acronym/",
206 "code/",
"samp/",
"kbd/",
"var/",
"sub/",
"sup/",
"del/",
"ins/",
207 "isindex",
"a/",
"img",
"br",
"map/",
"area",
"object/",
"param",
// Tag names that isValidTag() loads into `blockTags` through initTag().
// The list is nullptr-terminated, which is what initTag()'s loop stops on.
// Some entries end in '/', which initTag() tests for.
// NOTE(review): presumably the trailing '/' marks tags that also have a
// closing form -- confirm.
213 const char* cBlockTagInit[] = {
"p/",
"blockquote/",
"hr",
"h1/",
"h2/",
214 "h3/",
"h4/",
"h5/",
"h6/",
"dl/",
"dt/",
"dd/",
"ol/",
"ul/",
215 "li/",
"dir/",
"menu/",
"table/",
"tr/",
"th/",
"td/",
"col",
216 "colgroup/",
"caption/",
"thead/",
"tbody/",
"tfoot/",
"form/",
217 "select/",
"option",
"input",
"label/",
"textarea/",
"div/",
"pre/",
218 "address/",
"iframe/",
"frame/",
"frameset/",
"noframes/",
219 "center/",
"b/",
"i/",
"big/",
"small/",
"strike/",
"tt/",
220 "u/",
"font/",
"ins/",
"del/",
nullptr
// Tag-name lookup sets. isValidTag() fills both through initTag() the first
// time it finds `blockTags` empty.
227 std::set<std::string> otherTags, blockTags;
// Walks the nullptr-terminated array `init` on behalf of the tag set `set`.
// NOTE(review): the statements that insert into `set` are not visible here;
// presumably each name is added, with the trailing '/' handled specially --
// confirm.
229 void initTag(std::set<std::string>&
set,
const char* init[])
// Stop at the nullptr sentinel that ends the array.
231 for (
size_t x = 0; init[x] !=
nullptr; ++x)
// Work on a std::string copy of the current entry.
233 std::string
str = init[x];
// Checks the last character for '/'; `*str.rbegin()` relies on the entry
// being non-empty.
235 if (*
str.rbegin() ==
'/')
// Returns a string derived from `ref`; _processLinksImagesAndTags() calls it
// to turn link text into a link id.
// NOTE(review): most of the body is not visible here, so the exact
// normalisation applied cannot be stated -- confirm against the full source.
245 std::string cleanTextLinkRef(
const std::string& ref)
// True when `r` is empty or does not currently end in a space.
253 if (r.empty() || *r.rbegin() !=
' ')
// Looks `tag` up in the `otherTags` and `blockTags` sets.
// NOTE(review): the return statements and the test on `nonBlockFirst` are not
// visible here. The two pairs of lookups below run in opposite orders,
// presumably selected by `nonBlockFirst` -- confirm, along with the meaning
// of the size_t result.
271 size_t isValidTag(
const std::string& tag,
bool nonBlockFirst)
// Lazy initialisation: both sets are filled when `blockTags` is still empty.
273 if (blockTags.empty())
275 initTag(otherTags, cOtherTagInit);
276 initTag(blockTags, cBlockTagInit);
// First ordering: otherTags, then blockTags.
281 if (otherTags.find(tag) != otherTags.end())
286 if (blockTags.find(tag) != blockTags.end())
// Second ordering: blockTags, then otherTags.
293 if (blockTags.find(tag) != blockTags.end())
298 if (otherTags.find(tag) != otherTags.end())
313 if (mEncodingFlags != 0)
315 out << encodeString(mText, mEncodingFlags);
332 ReplacementTable replacements;
333 std::string
str = _processHtmlTagAttributes(*
text(), replacements);
334 str = _processCodeSpans(
str, replacements);
335 str = _processEscapedCharacters(
str);
336 str = _processLinksImagesAndTags(
str, replacements, idTable);
337 return _processBoldAndItalicSpans(
str, replacements);
// Rewrites HTML tags found in `src` so that each quoted attribute value is
// replaced by a numbered placeholder of the form "\x01@N@htmlTagAttr\x01",
// where N is the size of `replacements` at that moment.
// NOTE(review): the statement that stores the original attribute text in
// `replacements`, and the return statement, are not visible here -- confirm.
340 std::string RawText::_processHtmlTagAttributes(std::string src, ReplacementTable&
345 std::string::const_iterator prev = src.begin(), end = src.end();
// Matches an opening or closing tag that carries at least one attribute with
// a quoted value (single or double quotes, closed by the same quote).
349 static const boost::regex cHtmlToken(
"<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))+? */? *))>");
352 if (boost::regex_search(prev, end, m, cHtmlToken))
// Keep the text that precedes the tag unchanged.
359 tgt += std::string(prev, m[0].first);
// `fulltag` is the whole matched tag; `tgttag` receives its rewritten form.
361 std::string fulltag = m[0], tgttag;
362 std::string::const_iterator prevtag = fulltag.begin(), endtag = fulltag.end();
// Matches '=', an optional space, and a quoted string.
366 static const boost::regex cAttributeStrings(
"= ?(\"|').*?\\1");
369 if (boost::regex_search(prevtag, endtag, mtag, cAttributeStrings))
// Copy the tag text before the attribute value, then emit the placeholder
// and continue after the value.
371 tgttag += std::string(prevtag, mtag[0].first);
372 tgttag +=
"\x01@" +
std::to_string(replacements.size()) +
"@htmlTagAttr\x01";
373 prevtag = mtag[0].second;
// No further attribute values: copy the rest of the tag.
379 tgttag += std::string(prevtag, endtag);
// Copies the input from `prev` through the end of the current match.
389 tgt += std::string(prev, m[0].second);
// Copies whatever remains of the input.
395 tgt += std::string(prev, end);
// Replaces Markdown code spans in `src` with numbered placeholders of the
// form "\x01@N@codeSpan\x01" and appends a CodeSpan token for each one to
// `replacements` (N is the size of `replacements` just before the append).
403 std::string RawText::_processCodeSpans(std::string src, ReplacementTable&
// Two patterns, applied in this order: spans delimited by "`` " and " ``",
// then spans delimited by single backticks. In both, the opening delimiter
// must be at the start of the text or be preceded by a character other than
// a backslash.
406 static const boost::regex cCodeSpan[2] =
408 boost::regex(
"(?:^|(?<=[^\\\\]))`` (.+?) ``"),
409 boost::regex(
"(?:^|(?<=[^\\\\]))`(.+?)`")
// One scan of the text per pattern.
412 for (
const auto& pass : cCodeSpan)
415 std::string::const_iterator prev = src.begin(), end = src.end();
421 if (boost::regex_search(prev, end, m, pass))
// Keep the text before the span and put a placeholder where the span was.
423 tgt += std::string(prev, m[0].first);
424 tgt +=
"\x01@" +
std::to_string(replacements.size()) +
"@codeSpan\x01";
// The span content (capture 1) is passed through _restoreProcessedItems()
// before being stored in the CodeSpan token.
426 replacements.push_back(
TokenPtr(
new CodeSpan(_restoreProcessedItems(m[1], replacements))));
// Copy the text that follows the last match.
430 tgt += std::string(prev, end);
// Scans `src` for backslashes and processes the character associated with
// each one, building the result in `tgt`.
// NOTE(review): the branch taken for a recognised escape, and the statements
// that advance `i` and `prev`, are not visible here. Given the '#'-prefixed
// references decoded by _encodeProcessedItems(), a recognised escape is
// presumably replaced by such a placeholder -- confirm.
442 std::string RawText::_processEscapedCharacters(
const std::string& src)
445 std::string::const_iterator prev = src.begin(), end = src.end();
// Locate the next backslash at or after `prev`.
449 std::string::const_iterator i = std::find(prev, end,
'\\');
// Copy the text that precedes it unchanged.
453 tgt += std::string(prev, i);
// Ask whether the character at `i` is one of cEscapedCharacters.
458 std::optional<size_t> e = isEscapedCharacter(*i);
// Emit a literal backslash followed by the character at `i`.
466 tgt = tgt +
'\\' + *i;
// Copy the remaining text.
479 tgt += std::string(prev, end);
// Replaces every run of '*' or of '_' that has a single space on each side
// with a numbered placeholder of the form "\x01@N@spaceBracketed\x01", where
// N is the size of `replacements` at that moment.
// NOTE(review): the statement that stores the matched text in `replacements`
// is not visible here -- confirm.
487 std::string RawText::_processSpaceBracketedGroupings(
const std::string& src,
488 ReplacementTable& replacements)
// " ***... " or " ___... ": space, one or more marker characters, space.
490 static const boost::regex cRemove(
"(?:(?: \\*+ )|(?: _+ ))");
493 std::string::const_iterator prev = src.begin(), end = src.end();
499 if (boost::regex_search(prev, end, m, cRemove))
// Keep the text before the match and put a placeholder where the match was.
501 tgt += std::string(prev, m[0].first);
502 tgt +=
"\x01@" +
std::to_string(replacements.size()) +
"@spaceBracketed\x01";
// Copy the text that follows the last match.
508 tgt += std::string(prev, end);
// Scans `src` for Markdown links, Markdown images and angle-bracketed tags.
// Each match is replaced in the output by numbered "\x01@N@...\x01"
// placeholders, and the tokens that stand for it are appended to
// `replacements`. Reference-style links are looked up in `idTable`.
// NOTE(review): a number of statements (conditions, assignments and the
// return) are not visible here; comments below describe only what is shown.
516 std::string RawText::_processLinksImagesAndTags(
const std::string& src,
517 ReplacementTable& replacements,
const LinkIds& idTable)
// Alternatives visible below and their capture groups:
//   1-3: inline form     [text](url ...)       1 = optional '!', 2 = text, 3 = url part
//   4-6: reference form  [text] or [text][id]  4 = optional '!', 5 = text, 6 = id
//   7-8: tag form        <...>                 7 = everything inside, 8 = tag name
// NOTE(review): the end of this pattern is not visible here; further
// alternatives may follow.
535 static const boost::regex cExpression(
536 "(?:(!?)\\[([^\\]]+?)\\] *\\(([^\\(]*(?:\\(.*?\\).*?)*?)\\))"
537 "|(?:(!?)\\[((?:[^]]*?\\[.*?\\].*?)|(?:.+?))\\](?: *\\[(.*?)\\])?)"
538 "|(?:<(/?([a-zA-Z0-9]+).*?)>)"
545 std::string::const_iterator prev = src.begin(), end = src.end();
551 if (boost::regex_search(prev, end, m, cExpression))
// A successful search must yield a non-empty overall match.
553 assert(m[0].matched);
554 assert(m[0].length() != 0);
// Keep the text before the match and put a placeholder where the match was.
556 tgt += std::string(prev, m[0].first);
557 tgt +=
"\x01@" +
std::to_string(replacements.size()) +
"@links&Images1\x01";
// Classify the match from the capture groups that took part in it.
560 bool isImage =
false, isLink =
false, isReference =
false;
// Group 4 non-empty: a '!' in front of the reference form.
562 if (m[4].matched && m[4].length())
564 isImage = isReference =
true;
// Group 1 non-empty: a '!' in front of the inline form.
566 else if (m[1].matched && m[1].length())
// Group 5 took part (and no '!'): reference form.
570 else if (m[5].matched)
572 isLink = isReference =
true;
// Group 2 took part (and no '!'): inline form.
574 else if (m[2].matched)
579 if (isImage || isLink)
581 std::string contentsOrAlttext, url, title;
582 bool resolved =
false;
// Reference form: text is group 5; the explicit id, when present, is group 6.
586 contentsOrAlttext = m[5];
587 std::string linkId = (m[6].matched ? std::string(m[6]) :
std::string());
// The id is computed from the link text (the guarding condition is not
// visible here).
591 linkId = cleanTextLinkRef(contentsOrAlttext);
// Look the id up in the table of link definitions.
594 std::optional<markdown::LinkIds::Target>
target = idTable.find(linkId);
// Splits the parenthesised part of an inline link: capture 1 is the URL
// (optionally wrapped in <>), capture 3 a quoted title, capture 4 a
// parenthesised title.
605 static const boost::regex cReference(
"^<?([^ >]*)>?(?: *(?:('|\")(.*)\\2)|(?:\\((.*)\\)))? *$");
// Inline form: text is group 2; URL and title together are group 3.
607 contentsOrAlttext = m[2];
608 std::string urlAndTitle = m[3];
611 if (boost::regex_match(urlAndTitle, mm, cReference))
619 else if (mm[4].matched)
// Resume the scan one character past the start of the match and store that
// single character as a RawText replacement.
632 prev = m[0].first + 1;
633 replacements.push_back(
TokenPtr(
new RawText(std::string(m[0].first, prev))));
// Image: the replacement is an Image token.
637 replacements.push_back(
TokenPtr(
new Image(contentsOrAlttext,
// Link: an opening anchor token, then the link text inline in the output,
// then a second placeholder whose replacement is the closing "/a" tag.
642 replacements.push_back(
TokenPtr(
new HtmlAnchorTag(url, title)));
643 tgt += contentsOrAlttext;
644 tgt +=
"\x01@" +
std::to_string(replacements.size()) +
"@links&Images2\x01";
645 replacements.push_back(
TokenPtr(
new HtmlTag(
"/a")));
// Tag form: group 7 is everything between '<' and '>'.
651 std::string contents = m[7];
// <url>: a Container holding an anchor to that URL and a closing "/a".
656 if (looksLikeUrl(contents))
659 subgroup.push_back(
TokenPtr(
new HtmlAnchorTag(contents)));
661 subgroup.push_back(
TokenPtr(
new HtmlTag(
"/a")));
662 replacements.push_back(
TokenPtr(
new Container(subgroup)));
// <address>: an anchor to the emailEncode()d "mailto:" URL, the
// emailEncode()d address as its text, and a closing "/a".
664 else if (looksLikeEmailAddress(contents))
667 subgroup.push_back(
TokenPtr(
new HtmlAnchorTag(emailEncode(
"mailto:" + contents))));
668 subgroup.push_back(
TokenPtr(
new RawText(emailEncode(contents),
false)));
669 subgroup.push_back(
TokenPtr(
new HtmlTag(
"/a")));
670 replacements.push_back(
TokenPtr(
new Container(subgroup)));
// Stored as an HtmlTag built from the contents after
// _restoreProcessedItems() has been applied to them.
674 replacements.push_back(
TokenPtr(
new HtmlTag(_restoreProcessedItems(contents, replacements))));
// Copy the text that follows the last match.
685 tgt += std::string(prev, end);
// Splits `src` into RawText and BoldOrItalicMarker tokens, pairs opening and
// closing markers, and finally expands the numbered placeholders in the
// remaining text through _encodeProcessedItems().
// NOTE(review): many statements (loop headers, increments, the return) are
// not visible here; comments below describe only what is shown.
693 TokenGroup RawText::_processBoldAndItalicSpans(
const std::string& src,
694 ReplacementTable& replacements)
// Alternatives visible below and their capture groups:
//   1-2: a complete marker/text/marker run with no spaces (1 = marker, 2 = text)
//   3:   a run of 1-3 '*' or '_' in a position where it can open a span
//   4:   a run of 1-3 '*' or '_' in a position where it can close a span
// NOTE(review): the end of this pattern is not visible here.
696 static const boost::regex cEmphasisExpression(
697 "(?:(?<![*_])([*_]{1,3})([^*_ ]+?)\\1(?![*_]))"
698 "|((?:(?<!\\*)\\*{1,3}(?!\\*)|(?<!_)_{1,3}(?!_))(?=.)(?! )(?![.,:;] )(?![.,:;]$))"
699 "|((?<![* ])\\*{1,3}(?!\\*)|(?<![ _])_{1,3}(?!_))"
703 std::string::const_iterator i = src.begin(), end = src.end(), prev = i;
709 if (boost::regex_search(prev, end, m, cEmphasisExpression))
// Text between the previous match and this one becomes a RawText token.
711 if (prev != m[0].first) tgt.push_back(
TokenPtr(
new
712 RawText(std::string(prev, m[0].first))));
// Group 3: marker created with `true` as its first constructor argument.
716 std::string token = m[3];
717 tgt.push_back(
TokenPtr(
new BoldOrItalicMarker(
true, token[0],
// Group 4: marker created with `false` as its first constructor argument.
721 else if (m[4].matched)
723 std::string token = m[4];
724 tgt.push_back(
TokenPtr(
new BoldOrItalicMarker(
false, token[0],
// Groups 1-2: a `true` marker and a `false` marker are both created from the
// same marker text.
730 std::string token = m[1], contents = m[2];
731 tgt.push_back(
TokenPtr(
new BoldOrItalicMarker(
true, token[0],
734 tgt.push_back(
TokenPtr(
new BoldOrItalicMarker(
false, token[0],
// Pairing pass: for every unmatched open marker, look ahead for a close
// marker to match it with.
750 for (TokenGroup::iterator ii = tgt.begin(), iie = tgt.end(); ii != iie; ++ii)
752 if ((*ii)->isUnmatchedOpenMarker())
754 BoldOrItalicMarker* openToken =
dynamic_cast<BoldOrItalicMarker*
>(ii->get());
757 TokenGroup::iterator iii = ii;
759 for (++iii; iii != iie; ++iii)
761 if ((*iii)->isUnmatchedCloseMarker())
763 BoldOrItalicMarker* closeToken =
dynamic_cast<BoldOrItalicMarker*
>(iii->get());
// A size-3 close marker facing a smaller open marker is disabled and split
// into two close markers: one of the leftover size, one of the open size.
765 if (closeToken->size() == 3 && openToken->size() != 3)
769 closeToken->disable();
771 g.push_back(
TokenPtr(
new BoldOrItalicMarker(
false,
772 closeToken->tokenCharacter(), closeToken->size() -
773 openToken->size())));
774 g.push_back(
TokenPtr(
new BoldOrItalicMarker(
false,
775 closeToken->tokenCharacter(), openToken->size())));
776 TokenGroup::iterator after = iii;
778 tgt.splice(after, g);
// Same marker character and same size: record the pair on both tokens.
782 if (closeToken->tokenCharacter() == openToken->tokenCharacter()
783 && closeToken->size() == openToken->size())
785 openToken->matched(closeToken,
id);
786 closeToken->matched(openToken,
id);
// A size-3 open marker is disabled and split into two open markers: one of
// the leftover size, one of the close size.
790 else if (openToken->size() == 3)
794 openToken->disable();
796 g.push_back(
TokenPtr(
new BoldOrItalicMarker(
true,
797 openToken->tokenCharacter(), openToken->size() -
798 closeToken->size())));
799 g.push_back(
TokenPtr(
new BoldOrItalicMarker(
true,
800 openToken->tokenCharacter(), closeToken->size())));
801 TokenGroup::iterator after = ii;
803 tgt.splice(after, g);
// Nesting pass: matched open markers are pushed on a stack and each matched
// close marker is compared with the marker on top of it.
812 std::stack<BoldOrItalicMarker*> openMatches;
816 if (ii->isMatchedOpenMarker())
818 BoldOrItalicMarker* open =
dynamic_cast<BoldOrItalicMarker*
>(ii.get());
819 openMatches.push(open);
821 else if (ii->isMatchedCloseMarker())
823 BoldOrItalicMarker* close =
dynamic_cast<BoldOrItalicMarker*
>(ii.get());
// A close marker whose id differs from the top of the stack has its pairing
// cleared on both sides.
// NOTE(review): top() needs a non-empty stack; any guard is not visible here.
825 if (close->id() != openMatches.top()->id())
827 close->matchedTo()->matched(
nullptr);
828 close->matched(
nullptr);
// Loops while the stack's top marker has no partner.
834 while (!openMatches.empty() && openMatches.top()->matchedTo() ==
nullptr)
// Final pass: tokens that have text and can contain markup get their
// placeholders expanded, and the resulting tokens are appended to `r`.
846 if (ii->text() && ii->canContainMarkup())
848 TokenGroup t = _encodeProcessedItems(*ii->text(), replacements);
849 r.splice(r.end(), t);
// Converts `src` into a group of tokens, turning each "\x01@...@...\x01"
// placeholder back into a token.
// NOTE(review): the handling of the plain text held in `pre`, the test that
// selects the '#' branch, and the return are not visible here.
860 TokenGroup RawText::_encodeProcessedItems(
const std::string& src,
861 ReplacementTable& replacements)
// Capture 1 is the reference: an optional '#' followed by decimal digits.
863 static const boost::regex cReplaced(
"\x01@(#?[0-9]*)@.+?\x01");
866 std::string::const_iterator prev = src.begin();
872 if (boost::regex_search(prev, src.end(), m, cReplaced))
// Text between the previous placeholder and this one.
874 std::string pre = std::string(prev, m[0].first);
883 std::string ref = m[1];
// The first character of `ref` is skipped and the rest is an index passed to
// escapedCharacter(); the result becomes an EscapedCharacter token.
887 size_t n = std::stoul(ref.substr(1));
888 r.push_back(
TokenPtr(
new EscapedCharacter(escapedCharacter(n))));
// Otherwise a non-empty reference is an index into `replacements`.
890 else if (!ref.empty())
892 size_t n = std::stoul(ref);
894 assert(n < replacements.size());
895 r.push_back(replacements[n]);
// Text after the last placeholder.
900 std::string pre = std::string(prev, src.end());
// Rebuilds text from `src` by writing, for each "\x01@...@...\x01"
// placeholder, the original form of what it stands for into a string stream.
// NOTE(review): the handling of the plain text held in `pre`, the test that
// selects the '#' branch, and the return are not visible here.
914 std::string RawText::_restoreProcessedItems(
const std::string& src,
915 ReplacementTable& replacements)
// Capture 1 is the reference: an optional '#' followed by decimal digits.
917 static const boost::regex cReplaced(
"\x01@(#?[0-9]*)@.+?\x01");
919 std::ostringstream r;
920 std::string::const_iterator prev = src.begin();
926 if (boost::regex_search(prev, src.end(), m, cReplaced))
// Text between the previous placeholder and this one.
928 std::string pre = std::string(prev, m[0].first);
937 std::string ref = m[1];
// The first character of `ref` is skipped and the rest is an index passed to
// escapedCharacter(); a backslash and that character are written.
941 size_t n = std::stoul(ref.substr(1));
942 r <<
'\\' << escapedCharacter(n);
// Otherwise a non-empty reference is an index into `replacements`; that
// token writes its own original form.
944 else if (!ref.empty())
946 size_t n = std::stoul(ref);
948 assert(n < replacements.size());
949 replacements[n]->writeAsOriginal(r);
// Text after the last placeholder.
954 std::string pre = std::string(prev, src.end());
978 out <<
"<pre><code>";
980 out <<
"</code></pre>\n\n";
992 out <<
'`' << *
text() <<
'`';
1003 mSubToken->writeAsHtml(out);
1011 out << std::string(indent * 2,
' ') <<
containerName() <<
"\n";
1015 mSubToken->writeToken(indent + 1, out);
1028 std::optional<TokenGroup> subt = (*ii)->processSpanElements(idTable);
1032 if (subt->size() > 1)
1036 else if (!subt->empty())
1038 t.push_back(*subt->begin());
1048 std::optional<TokenGroup> subt = (*ii)->processSpanElements(idTable);
1054 t.push_back(
c->clone(*subt));
1064 return std::nullopt;
1072 for (
const auto& content : contents)
1092 if (mMatch !=
nullptr)
1094 assert(mSize >= 1 && mSize <= 3);
1098 out << (mSize == 1 ?
"<em>" : mSize == 2 ?
"<strong>" :
"<strong><em>");
1102 out << (mSize == 1 ?
"</em>" : mSize == 2 ?
"</strong>" :
"</em></strong>");
1107 out << std::string(mSize, mTokenCharacter);
1116 if (mMatch !=
nullptr)
1118 std::string type = (mSize == 1 ?
"italic" : mSize == 2 ?
"bold" :
"italic&bold");
1122 out <<
"Matched open-" << type <<
" marker\n";
1126 out <<
"Matched close-" << type <<
" marker\n";
1131 if (mOpenMarker) out <<
"Unmatched bold/italic open marker: " <<
1132 std::string(mSize, mTokenCharacter) <<
"\n";
1133 else out <<
"Unmatched bold/italic close marker: " <<
1134 std::string(mSize, mTokenCharacter) <<
"\n";
1141 out <<
"<img src=\"" << mUrl <<
"\" alt=\"" << mAltText <<
"\"";
1143 if (!mTitle.empty())
1145 out <<
" title=\"" << mTitle <<
"\"";