14 #include <boost/regex.hpp>
// Table of characters recognised after a backslash. Other code in this file
// refers to an entry by its position (index) in this string.
22 const std::string cEscapedCharacters(
"\\`*_{}[]()#+-.!>");
// Searches cEscapedCharacters for `c` with a linear std::find and branches on
// whether the character was found (iterator != end()).
// NOTE(review): the return type and the return statements are not visible in
// this excerpt. A caller stores the result in std::optional<size_t>, so this
// presumably yields the position of `c` in the table, or an empty optional --
// confirm against the full source.
25 isEscapedCharacter(
char c)
27 std::string::const_iterator i =
28 std::find(cEscapedCharacters.begin(), cEscapedCharacters.end(),
c);
30 if (i != cEscapedCharacters.end())
// Returns the character stored at position `index` of cEscapedCharacters.
// The lookup uses operator[], which performs no bounds check: callers must
// pass an index smaller than cEscapedCharacters.size().
41 escapedCharacter(
size_t index)
43 return cEscapedCharacters[
index];
// Walks `src` one character at a time and gives special treatment to '&',
// '<', '>' and '"' depending on which bits are set in `encodingFlags`.
// The replacement text written in each branch is not visible in this excerpt.
47 encodeString(
const std::string& src,
int encodingFlags)
// Decode the flag bits once, before the per-character loop. `doubleAmps`,
// tested further down, is declared on a line not visible in this excerpt.
49 bool amps = (encodingFlags &
cAmps) != 0,
51 angleBrackets = (encodingFlags &
cAngles) != 0,
52 quotes = (encodingFlags &
cQuotes) != 0;
56 for (std::string::const_iterator i = src.begin(), ie = src.end(); i != ie; ++i)
58 if (*i ==
'&' && amps)
// Pattern applied to the text starting at the current '&'. The second and
// third alternatives match a decimal character reference of 1-3 digits and a
// hexadecimal one of 1-2 hex digits.
// NOTE(review): as spelled here the first alternative is "^(&)", which
// matches any '&' at the start of the searched range, so the regex_search
// below would always succeed in this branch. The literal may have been
// "&amp;" before an entity-decoding step mangled this listing -- confirm
// against the real source file.
60 static const boost::regex cIgnore(
61 "^(&)|(&#[0-9]{1,3};)|(&#[xX][0-9a-fA-F]{1,2};)");
63 if (boost::regex_search(i, ie, cIgnore))
72 else if (*i ==
'&' && doubleAmps)
76 else if (*i ==
'<' && angleBrackets)
80 else if (*i ==
'>' && angleBrackets)
84 else if (*i ==
'\"' && quotes)
// Compares the beginning of `str` with each entry of a fixed list of URL
// prefixes. The comparison is byte-wise and therefore case-sensitive.
// The accept/reject decision taken after the inner loop is not visible in
// this excerpt.
98 looksLikeUrl(
const std::string&
str)
// The prefix list is terminated by nullptr so the loop below needs no count.
100 const char* schemes[] = {
101 "http://",
"https://",
"ftp://",
"ftps://",
"file://",
"www.",
"ftp.",
nullptr};
103 for (
size_t x = 0; schemes[x] !=
nullptr; ++x)
// `s` walks the input, `t` walks the current prefix.
105 const char *
s =
str.c_str(), *t = schemes[x];
// Advance while neither string has ended and the characters agree.
107 while (*
s != 0 && *t != 0 && *
s == *t)
// Predicate: true when `c` is NOT one of the characters accepted before the
// '@' of an e-mail address (alphanumerics, '.', '_', '%', '-', '+').
// Passed to std::find_if by looksLikeEmailAddress.
// NOTE(review): isalnum() is called with a plain char. Where char is signed,
// a byte >= 0x80 is negative, which is undefined behaviour for the <cctype>
// functions -- consider static_cast<unsigned char>(c).
123 notValidNameCharacter(
char c)
125 return !(isalnum(
c) ||
c ==
'.' ||
c ==
'_' ||
c ==
'%' ||
c ==
'-' ||
c ==
'+');
// Predicate: true when `c` is NOT accepted after the '@' of an e-mail
// address. Accepted: alphanumerics, '.', '_', '%', '-', and any byte with the
// high bit set (c & 0x80). Passed to std::find_if by looksLikeEmailAddress.
// NOTE(review): isalnum() is evaluated first and receives a plain char; for
// high-bit bytes on signed-char platforms that call is undefined behaviour
// even though the later (c & 0x80) test accepts the byte -- consider casting
// to unsigned char.
129 notValidSiteCharacter(
char c)
135 return !(isalnum(
c) ||
c ==
'.' ||
c ==
'_' ||
c ==
'%' ||
c ==
'-' || (
c & 0x80));
// Builds an encoded form of `src` in an ostringstream. Two output shapes are
// visible: a hexadecimal character reference ("&#x..;") and a decimal one
// ("&#..;"). How the shape is chosen for each character is not visible in
// this excerpt.
145 emailEncode(
const std::string& src)
147 std::ostringstream out;
// std::hex / std::dec are sticky stream flags, so each branch sets the base
// explicitly before printing the numeric value.
158 out <<
"&#x" << std::hex << static_cast<int>(i) <<
';';
162 out <<
"&#" << std::dec << static_cast<int>(i) <<
';';
// Heuristic check that `str` has the shape of an e-mail address. The visible
// steps are:
//   1. the first character rejected by notValidNameCharacter must be '@' and
//      must not be the first character of the string;
//   2. the text after '@' is scanned with notValidSiteCharacter;
//   3. scanning backwards from the end, the first character rejected by
//      isNotAlpha (defined elsewhere) must be '.';
//   4. a value `d` must lie in [2, 4].
// The computation of `d` and the return statements are not visible in this
// excerpt; `d` is presumably the length of the trailing alphabetic run --
// confirm.
172 looksLikeEmailAddress(
const std::string&
str)
174 using Iter = std::string::const_iterator;
175 using RIter = std::string::const_reverse_iterator;
176 Iter i = std::find_if(
str.begin(),
str.end(), notValidNameCharacter);
178 if (i !=
str.end() && *i ==
'@' && i !=
str.begin())
// Continue scanning just past the '@'.
181 i = std::find_if(i + 1,
str.end(), notValidSiteCharacter);
186 RIter ri = std::find_if(
str.rbegin(),
str.rend(), isNotAlpha);
188 if (ri !=
str.rend() && *ri ==
'.')
193 if (d >= 2 && d <= 4)
// Initialiser tables of HTML tag names, later loaded into the two std::set
// objects declared below via initTag(). The entries of cOtherTagInit are not
// visible in this excerpt.
207 const char* cOtherTagInit[] = {
// Every visible entry ends in '/'. initTag() tests for that trailing
// character; what it does when the test succeeds is not visible here.
247 const char* cBlockTagInit[] = {
"p/",
"blockquote/",
258 "colgroup/",
"caption/",
266 "frameset/",
"noframes/",
// Lookup sets, populated on first use from the tables above.
278 std::set<std::string> otherTags, blockTags;
// Iterates over the nullptr-terminated array `init`, copying each entry into
// a std::string and testing whether its last character is '/'.
// The statements that insert into `set` are not visible in this excerpt.
// Entries must be non-empty: *str.rbegin() on an empty string is undefined.
281 initTag(std::set<std::string>&
set,
const char* init[])
283 for (
size_t x = 0; init[x] !=
nullptr; ++x)
285 std::string
str = init[x];
287 if (*
str.rbegin() ==
'/')
// Derives a cleaned-up link reference from `ref`. Only one condition of the
// body is visible in this excerpt.
298 cleanTextLinkRef(
const std::string& ref)
// True when the accumulated result `r` is empty or does not already end in a
// space; r.empty() is tested first so *r.rbegin() is never evaluated on an
// empty string. Presumably this avoids consecutive spaces -- confirm.
306 if (r.empty() || *r.rbegin() !=
' ')
// Fragment of a tag-lookup routine whose header is not visible in this
// excerpt. Both tag sets are filled on first use, with an empty blockTags
// serving as the "not yet initialised" signal.
325 if (blockTags.empty())
327 initTag(otherTags, cOtherTagInit);
328 initTag(blockTags, cBlockTagInit);
// `tag` is then looked up in the two sets, in two different orders
// (otherTags first, then blockTags first). The condition selecting the order
// and the values returned are not visible here.
333 if (otherTags.find(tag) != otherTags.end())
338 if (blockTags.find(tag) != blockTags.end())
345 if (blockTags.find(tag) != blockTags.end())
350 if (otherTags.find(tag) != otherTags.end())
// Fragment of an output routine whose header is not visible in this excerpt.
// When any encoding flag is set, mText is passed through encodeString()
// before being written; the branch for mEncodingFlags == 0 is not visible.
364 if (mEncodingFlags != 0)
366 out << encodeString(mText, mEncodingFlags);
// Span-level processing pipeline; the function name line is not visible in
// this excerpt. The text is run through five passes in a fixed order, with
// `replacements` shared between the passes that take it:
//   HTML tag attributes -> code spans -> escaped characters ->
//   links/images/tags -> bold/italic spans.
376 std::optional<TokenGroup>
384 ReplacementTable replacements;
385 std::string
str = _processHtmlTagAttributes(*
text(), replacements);
386 str = _processCodeSpans(
str, replacements);
387 str = _processEscapedCharacters(
str);
388 str = _processLinksImagesAndTags(
str, replacements, idTable);
// The last pass returns the token group rather than a string.
389 return _processBoldAndItalicSpans(
str, replacements);
// Finds HTML tags that carry quoted attribute values and, inside each such
// tag, locates every `= "..."` / `= '...'` string. A placeholder of the form
// "\x01@<N>@htmlTagAttr\x01" is built for each one, where <N> is the current
// size of `replacements`. What is stored in `replacements`, and the target of
// the placeholder expression, are not visible in this excerpt.
393 RawText::_processHtmlTagAttributes(std::string src, ReplacementTable& replacements)
397 std::string::const_iterator prev = src.begin(), end = src.end();
// Matches an opening or closing tag with at least one attribute whose value
// is quoted; back-reference \5 requires the closing quote to equal the
// opening one.
401 static const boost::regex cHtmlToken(
402 "<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))+? */? *))>");
405 if (boost::regex_search(prev, end, m, cHtmlToken))
// Copy the text preceding the tag unchanged.
412 tgt += std::string(prev, m[0].first);
414 std::string fulltag = m[0], tgttag;
415 std::string::const_iterator prevtag = fulltag.begin(), endtag = fulltag.end();
// Matches one quoted attribute value, including the leading '='.
419 static const boost::regex cAttributeStrings(
"= ?(\"|').*?\\1");
422 if (boost::regex_search(prevtag, endtag, mtag, cAttributeStrings))
424 tgttag += std::string(prevtag, mtag[0].first);
426 "\x01@" +
std::to_string(replacements.size()) +
"@htmlTagAttr\x01";
// Resume scanning the tag just after the attribute string.
427 prevtag = mtag[0].second;
// Remainder of the tag after the last attribute string.
434 tgttag += std::string(prevtag, endtag);
444 tgt += std::string(prev, m[0].second);
// Remainder of the input after the last matched tag.
450 tgt += std::string(prev, end);
// Replaces code spans in `src` with placeholders and stores a CodeSpan token
// for each in `replacements`. Two passes are made, one per pattern:
// first the "`` text ``" form (double backticks with inner spaces), then the
// single-backtick form. In both, the opening backtick must be at the start of
// the text or be preceded by a character other than a backslash.
459 RawText::_processCodeSpans(std::string src, ReplacementTable& replacements)
461 static const boost::regex cCodeSpan[2] = {boost::regex(
"(?:^|(?<=[^\\\\]))`` (.+?) ``"),
462 boost::regex(
"(?:^|(?<=[^\\\\]))`(.+?)`")};
464 for (
const auto& pass : cCodeSpan)
467 std::string::const_iterator prev = src.begin(), end = src.end();
473 if (boost::regex_search(prev, end, m, pass))
// Text before the span is copied as is.
475 tgt += std::string(prev, m[0].first);
// The placeholder is written BEFORE the push_back below, so the number it
// carries (replacements.size()) is the index the new token will occupy.
476 tgt +=
"\x01@" +
std::to_string(replacements.size()) +
"@codeSpan\x01";
// Placeholders from earlier passes that ended up inside the span are expanded
// back to their original text before the CodeSpan is created.
478 replacements.push_back(
479 TokenPtr(
new CodeSpan(_restoreProcessedItems(m[1], replacements))));
// Remainder of the input after the last match of this pass.
483 tgt += std::string(prev, end);
// Scans `src` for backslashes and handles the character following each one.
// The placeholder written for a recognised escape is not visible in this
// excerpt.
496 RawText::_processEscapedCharacters(
const std::string& src)
499 std::string::const_iterator prev = src.begin(), end = src.end();
503 std::string::const_iterator i = std::find(prev, end,
'\\');
// Copy everything up to (not including) the backslash.
507 tgt += std::string(prev, i);
// NOTE(review): std::find leaves `i` on the backslash itself; presumably `i`
// is advanced (and checked against `end`) on lines not visible here before it
// is dereferenced -- confirm.
512 std::optional<size_t> e = isEscapedCharacter(*i);
// Re-emits the backslash followed by the character unchanged; presumably the
// branch for a character that is not in the escape table -- confirm.
520 tgt = tgt +
'\\' + *i;
// Remainder of the input after the last backslash handled.
533 tgt += std::string(prev, end);
// Replaces runs of '*' or of '_' that have a single space on each side with a
// placeholder "\x01@<N>@spaceBracketed\x01", where <N> is the current size of
// `replacements`. What is stored in `replacements` is not visible in this
// excerpt.
542 RawText::_processSpaceBracketedGroupings(
const std::string& src, ReplacementTable& replacements)
544 static const boost::regex cRemove(
"(?:(?: \\*+ )|(?: _+ ))");
547 std::string::const_iterator prev = src.begin(), end = src.end();
553 if (boost::regex_search(prev, end, m, cRemove))
// Text before the match is copied unchanged.
555 tgt += std::string(prev, m[0].first);
556 tgt +=
"\x01@" +
std::to_string(replacements.size()) +
"@spaceBracketed\x01";
// Remainder of the input after the last match.
562 tgt += std::string(prev, end);
// Finds inline links/images, reference links/images and HTML-like "<...>"
// tags in `src`. Each is replaced by a numbered placeholder in the output
// string, and the corresponding token(s) are appended to `replacements`.
// `idTable` resolves reference-style link ids.
571 RawText::_processLinksImagesAndTags(
const std::string& src,
572 ReplacementTable& replacements,
573 const LinkIds& idTable)
// Capture groups of the visible alternatives (more alternatives may follow on
// lines not visible in this excerpt):
//   1-3: optional '!', text, and parenthesised target  -> [text](target)
//   4-6: optional '!', text, and optional bracketed id -> [text][id]
//   7-8: contents of <...> and the tag name inside it
591 static const boost::regex cExpression(
592 "(?:(!?)\\[([^\\]]+?)\\] *\\(([^\\(]*(?:\\(.*?\\).*?)*?)\\))"
593 "|(?:(!?)\\[((?:[^]]*?\\[.*?\\].*?)|(?:.+?))\\](?: *\\[(.*?)\\])?)"
594 "|(?:<(/?([a-zA-Z0-9]+).*?)>)"
601 std::string::const_iterator prev = src.begin(), end = src.end();
607 if (boost::regex_search(prev, end, m, cExpression))
// A zero-length match would keep the scan from advancing.
609 assert(m[0].matched);
610 assert(m[0].length() != 0);
612 tgt += std::string(prev, m[0].first);
// Placeholder numbered with the index the next pushed token will receive.
613 tgt +=
"\x01@" +
std::to_string(replacements.size()) +
"@links&Images1\x01";
616 bool isImage =
false, isLink =
false, isReference =
false;
// Classification: a non-empty group 4 or 1 means the '!' prefix was present;
// groups 4/5 belong to the reference form, groups 1/2 to the inline form.
// The assignments for the group-1 and group-2 branches are not visible here.
618 if (m[4].matched && m[4].length())
620 isImage = isReference =
true;
622 else if (m[1].matched && m[1].length())
626 else if (m[5].matched)
628 isLink = isReference =
true;
630 else if (m[2].matched)
635 if (isImage || isLink)
637 std::string contentsOrAlttext, url, title;
638 bool resolved =
false;
// Reference form: use the explicit id from group 6 when present.
642 contentsOrAlttext = m[5];
643 std::string linkId = (m[6].matched ? std::string(m[6]) :
std::string());
// Otherwise the id is derived from the link text itself; the guarding
// condition is not visible in this excerpt.
647 linkId = cleanTextLinkRef(contentsOrAlttext);
650 std::optional<markdown::LinkIds::Target>
target = idTable.find(linkId);
// Inline form: splits "url" from an optional title given either in matching
// quotes (group 3) or in parentheses (group 4).
661 static const boost::regex cReference(
662 "^<?([^ >]*)>?(?: *(?:('|\")(.*)\\2)|(?:\\((.*)\\)))? *$");
664 contentsOrAlttext = m[2];
665 std::string urlAndTitle = m[3];
668 if (boost::regex_match(urlAndTitle, mm, cReference))
676 else if (mm[4].matched)
// Resumes the search one character past the start of the match; presumably
// the path taken when the link could not be resolved -- confirm.
689 prev = m[0].first + 1;
690 replacements.push_back(
695 replacements.push_back(
TokenPtr(
new Image(contentsOrAlttext, url, title)));
// A link becomes: opening anchor token, the link text left in the output
// string (so later passes still see it), a second placeholder, and a closing
// "/a" tag token.
699 replacements.push_back(
TokenPtr(
new HtmlAnchorTag(url, title)));
700 tgt += contentsOrAlttext;
701 tgt +=
"\x01@" +
std::to_string(replacements.size()) +
"@links&Images2\x01";
702 replacements.push_back(
TokenPtr(
new HtmlTag(
"/a")));
// "<...>" form: the contents may be a URL, an e-mail address, or a plain HTML
// tag.
708 std::string contents = m[7];
713 if (looksLikeUrl(contents))
716 subgroup.push_back(
TokenPtr(
new HtmlAnchorTag(contents)));
718 subgroup.push_back(
TokenPtr(
new HtmlTag(
"/a")));
719 replacements.push_back(
TokenPtr(
new Container(subgroup)));
721 else if (looksLikeEmailAddress(contents))
// Both the mailto: target and the visible text go through emailEncode().
725 TokenPtr(
new HtmlAnchorTag(emailEncode(
"mailto:" + contents))));
726 subgroup.push_back(
TokenPtr(
new RawText(emailEncode(contents),
false)));
727 subgroup.push_back(
TokenPtr(
new HtmlTag(
"/a")));
728 replacements.push_back(
TokenPtr(
new Container(subgroup)));
// Anything else is kept as an HTML tag, with earlier placeholders inside it
// expanded back to their original text.
732 replacements.push_back(
733 TokenPtr(
new HtmlTag(_restoreProcessedItems(contents, replacements))));
// Remainder of the input after the last match.
744 tgt += std::string(prev, end);
// Turns '*' / '_' emphasis markers in `src` into BoldOrItalicMarker tokens,
// pairs open markers with close markers, un-pairs badly nested ones, and
// finally expands the placeholders left by earlier passes. Several loop
// headers and bodies are not visible in this excerpt.
753 RawText::_processBoldAndItalicSpans(
const std::string& src, ReplacementTable& replacements)
// Capture groups of the visible alternatives:
//   1,2: a marker of 1-3 characters, a run without spaces or markers, and the
//        same marker again (back-reference \1)
//   3:   a marker used below as an OPEN marker
//   4:   a marker used below as a CLOSE marker
755 static const boost::regex cEmphasisExpression(
756 "(?:(?<![*_])([*_]{1,3})([^*_ ]+?)\\1(?![*_]))"
757 "|((?:(?<!\\*)\\*{1,3}(?!\\*)|(?<!_)_{1,3}(?!_))(?=.)(?! "
758 ")(?![.,:;] )(?![.,:;]$))"
759 "|((?<![* ])\\*{1,3}(?!\\*)|(?<![ _])_{1,3}(?!_))"
763 std::string::const_iterator i = src.begin(), end = src.end(), prev = i;
769 if (boost::regex_search(prev, end, m, cEmphasisExpression))
771 if (prev != m[0].first)
// The first constructor argument is true for open markers and false for close
// markers (see how openToken/closeToken replacements are built below).
776 std::string token = m[3];
777 tgt.push_back(
TokenPtr(
new BoldOrItalicMarker(
true, token[0], token.length())));
780 else if (m[4].matched)
782 std::string token = m[4];
784 TokenPtr(
new BoldOrItalicMarker(
false, token[0], token.length())));
// Complete "marker word marker" match: emits an open and a close marker.
789 std::string token = m[1], contents = m[2];
790 tgt.push_back(
TokenPtr(
new BoldOrItalicMarker(
true, token[0], token.length())));
793 TokenPtr(
new BoldOrItalicMarker(
false, token[0], token.length())));
// Pairing pass: for every unmatched open marker, scan forward for an
// unmatched close marker.
808 for (TokenGroup::iterator ii = tgt.begin(), iie = tgt.end(); ii != iie; ++ii)
810 if ((*ii)->isUnmatchedOpenMarker())
// NOTE(review): the dynamic_cast results in this function are used without a
// null check; this relies on the is...Marker() predicates being true only for
// BoldOrItalicMarker tokens -- confirm.
812 BoldOrItalicMarker* openToken =
dynamic_cast<BoldOrItalicMarker*
>(ii->get());
815 TokenGroup::iterator iii = ii;
817 for (++iii; iii != iie; ++iii)
819 if ((*iii)->isUnmatchedCloseMarker())
821 BoldOrItalicMarker* closeToken =
822 dynamic_cast<BoldOrItalicMarker*
>(iii->get());
// A 3-character close marker facing a shorter open marker is disabled and
// replaced by two close markers: one of the leftover size and one of the open
// marker's size.
824 if (closeToken->size() == 3 && openToken->size() != 3)
828 closeToken->disable();
831 new BoldOrItalicMarker(
false,
832 closeToken->tokenCharacter(),
833 closeToken->size() - openToken->size())));
834 g.push_back(
TokenPtr(
new BoldOrItalicMarker(
835 false, closeToken->tokenCharacter(), openToken->size())));
836 TokenGroup::iterator after = iii;
838 tgt.splice(after, g);
// Same character and same length: record the pair on both markers.
842 if (closeToken->tokenCharacter() == openToken->tokenCharacter() &&
843 closeToken->size() == openToken->size())
845 openToken->matched(closeToken,
id);
846 closeToken->matched(openToken,
id);
// Mirror case: a 3-character open marker is disabled and split in two.
850 else if (openToken->size() == 3)
854 openToken->disable();
857 new BoldOrItalicMarker(
true,
858 openToken->tokenCharacter(),
859 openToken->size() - closeToken->size())));
860 g.push_back(
TokenPtr(
new BoldOrItalicMarker(
861 true, openToken->tokenCharacter(), closeToken->size())));
862 TokenGroup::iterator after = ii;
864 tgt.splice(after, g);
// Nesting pass: matched open markers are pushed on a stack; a close marker
// whose id differs from the top of the stack is un-matched together with its
// partner.
873 std::stack<BoldOrItalicMarker*> openMatches;
877 if (ii->isMatchedOpenMarker())
879 BoldOrItalicMarker* open =
dynamic_cast<BoldOrItalicMarker*
>(ii.get());
880 openMatches.push(open);
882 else if (ii->isMatchedCloseMarker())
884 BoldOrItalicMarker* close =
dynamic_cast<BoldOrItalicMarker*
>(ii.get());
// NOTE(review): top() on an empty std::stack is undefined behaviour; no
// emptiness check is visible before this line -- confirm one exists.
886 if (close->id() != openMatches.top()->id())
888 close->matchedTo()->matched(
nullptr);
889 close->matched(
nullptr);
// Drop stack entries whose match has been cleared.
895 while (!openMatches.empty() && openMatches.top()->matchedTo() ==
nullptr)
// Final pass: text tokens that may contain markup have their placeholders
// expanded, and the resulting tokens are spliced into the result list.
907 if (ii->text() && ii->canContainMarkup())
909 TokenGroup t = _encodeProcessedItems(*ii->text(), replacements);
910 r.splice(r.end(), t);
// Splits `src` at the "\x01@<ref>@<label>\x01" placeholders and builds a
// token list: the referenced token for each placeholder, plus (on lines not
// visible in this excerpt) something for the text in between.
922 RawText::_encodeProcessedItems(
const std::string& src, ReplacementTable& replacements)
// Group 1 is the reference: optional '#' followed by digits.
924 static const boost::regex cReplaced(
"\x01@(#?[0-9]*)@.+?\x01");
927 std::string::const_iterator prev = src.begin();
933 if (boost::regex_search(prev, src.end(), m, cReplaced))
// Plain text preceding the placeholder.
935 std::string pre = std::string(prev, m[0].first);
944 std::string ref = m[1];
// First character skipped: presumably the '#' form, whose number is an index
// into the escaped-character table -- the guarding condition is not visible.
// NOTE(review): the pattern allows zero digits; std::stoul throws
// std::invalid_argument on an empty string.
948 size_t n = std::stoul(ref.substr(1));
949 r.push_back(
TokenPtr(
new EscapedCharacter(escapedCharacter(n))));
951 else if (!ref.empty())
953 size_t n = std::stoul(ref);
// The assert is the only range check; with NDEBUG defined the index is used
// unchecked.
955 assert(n < replacements.size());
956 r.push_back(replacements[n]);
// Text after the last placeholder.
961 std::string pre = std::string(prev, src.end());
// Rebuilds text from `src` by expanding each "\x01@<ref>@<label>\x01"
// placeholder: a backslash plus the escaped character, or whatever the
// referenced token writes via writeAsOriginal(). The result is accumulated in
// an ostringstream.
976 RawText::_restoreProcessedItems(
const std::string& src, ReplacementTable& replacements)
// Group 1 is the reference: optional '#' followed by digits.
978 static const boost::regex cReplaced(
"\x01@(#?[0-9]*)@.+?\x01");
980 std::ostringstream r;
981 std::string::const_iterator prev = src.begin();
987 if (boost::regex_search(prev, src.end(), m, cReplaced))
// Plain text preceding the placeholder.
989 std::string pre = std::string(prev, m[0].first);
998 std::string ref = m[1];
// First character skipped: presumably the '#' form -- the guarding condition
// is not visible in this excerpt.
1002 size_t n = std::stoul(ref.substr(1));
1003 r <<
'\\' << escapedCharacter(n);
1005 else if (!ref.empty())
1007 size_t n = std::stoul(ref);
// The assert is the only range check; with NDEBUG defined the index is used
// unchecked.
1009 assert(n < replacements.size());
1010 replacements[n]->writeAsOriginal(r);
// Text after the last placeholder.
1015 std::string pre = std::string(prev, src.end());
// Fragment of a larger string expression; the enclosing definition and the
// first half of this conditional are not visible in this excerpt.
// This branch builds a title="..." attribute, with quotes and ampersands in
// `title` encoded via encodeString().
1032 :
" title=\"" + encodeString(title,
cQuotes |
cAmps) +
"\"") +
// Fragment of an HTML writer whose header is not visible in this excerpt:
// writes the opening and closing <pre><code> wrapper. The content written
// between the two is not visible here.
1043 out <<
"<pre><code>";
1045 out <<
"</code></pre>\n\n";
// Fragment whose header is not visible in this excerpt: writes the token's
// text wrapped in single backticks.
1059 out <<
'`' << *
text() <<
'`';
// Fragments of two writers whose headers are not visible in this excerpt;
// both delegate to the wrapped sub-token.
1069 mSubToken->writeAsHtml(out);
// Debug-style dump: two spaces of indentation per level, the container name,
// then the sub-token one level deeper.
1078 out << std::string(indent * 2,
' ') <<
containerName() <<
"\n";
1082 mSubToken->writeToken(indent + 1, out);
// Fragment of a routine that applies processSpanElements() to child tokens
// and collects the results in `t`; its header and loops are not visible in
// this excerpt.
1086 std::optional<TokenGroup>
1095 std::optional<TokenGroup> subt = (*ii)->processSpanElements(idTable);
// NOTE(review): `subt` is dereferenced below; the check that the optional
// holds a value is not visible in this excerpt -- confirm it exists.
1099 if (subt->size() > 1)
// Exactly one resulting token: append it directly.
1103 else if (!subt->empty())
1105 t.push_back(*subt->begin());
1115 std::optional<TokenGroup> subt = (*ii)->processSpanElements(idTable);
// The child `c` is cloned around its processed contents.
1121 t.push_back(
c->clone(*subt));
1131 return std::nullopt;
// Start of a separate loop over `contents`; its body and enclosing definition
// are not visible in this excerpt.
1139 for (
const auto& content : contents)
// Fragment of an HTML writer whose header is not visible in this excerpt.
// A matched marker (mMatch != nullptr) is written as an HTML tag selected by
// its size: 1 -> em, 2 -> strong, 3 -> strong+em. The closing form for size 3
// mirrors the opening form so the tags nest correctly. Which of the two
// outputs applies (open or close) is decided on lines not visible here.
1158 if (mMatch !=
nullptr)
1160 assert(mSize >= 1 && mSize <= 3);
1164 out << (mSize == 1 ?
"<em>" : mSize == 2 ?
"<strong>" :
"<strong><em>");
1168 out << (mSize == 1 ?
"</em>" : mSize == 2 ?
"</strong>" :
"</em></strong>");
// Writes the literal marker characters back out; presumably the path for an
// unmatched marker -- confirm.
1173 out << std::string(mSize, mTokenCharacter);
// Fragment of a diagnostic dump whose header is not visible in this excerpt.
// Matched markers are described by type (size 1 -> italic, 2 -> bold,
// otherwise italic&bold); unmatched markers are printed with their literal
// characters. The open/close selection happens on lines not visible here.
1183 if (mMatch !=
nullptr)
1185 std::string type = (mSize == 1 ?
"italic" : mSize == 2 ?
"bold" :
"italic&bold");
1189 out <<
"Matched open-" << type <<
" marker\n";
1193 out <<
"Matched close-" << type <<
" marker\n";
1199 out <<
"Unmatched bold/italic open marker: "
1200 << std::string(mSize, mTokenCharacter) <<
"\n";
1202 out <<
"Unmatched bold/italic close marker: "
1203 << std::string(mSize, mTokenCharacter) <<
"\n";
// Fragment of an HTML writer whose header is not visible in this excerpt:
// writes an <img> tag with src and alt attributes, plus title when mTitle is
// non-empty. The end of the tag is written on lines not visible here.
// NOTE(review): mUrl, mAltText and mTitle are written verbatim here. If they
// are not encoded before being stored, a '"' in any of them breaks out of the
// attribute -- confirm, or pass them through encodeString() as is done for
// the anchor title elsewhere in this file.
1211 out <<
"<img src=\"" << mUrl <<
"\" alt=\"" << mAltText <<
"\"";
1213 if (!mTitle.empty())
1215 out <<
" title=\"" << mTitle <<
"\"";