13 #include <boost/regex.hpp>
15 #include <SimoxUtility/algorithm/string/string_tools.h>
// Regex source for a single HTML tag: an opening angle bracket, an optional
// slash (capture 2), an alphanumeric tag name (capture 3), zero or more
// attributes of the form name=value where the value is wrapped in a matching
// pair of quote characters (capture 5 is the quote), an optional trailing
// slash and the closing angle bracket. Capture 1 spans everything between the
// two angle brackets.
// Three compiled variants follow: unanchored (cHtmlTokenExpression), anchored
// at the start of the input (cStartHtmlTokenExpression) and anchored at both
// ends (cOneHtmlTokenExpression).
33 const std::string cHtmlTokenSource(
34 "<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))>");
35 const boost::regex cHtmlTokenExpression(cHtmlTokenSource),
36 cStartHtmlTokenExpression(
"^" + cHtmlTokenSource),
37 cOneHtmlTokenExpression(
"^" + cHtmlTokenSource +
"$");
// Flag type accepted by parseHtmlTag. The enumerator list is not visible in
// this excerpt; cAlone and cStarts are the values used by the code below.
39 enum ParseHtmlTagFlags
// Tries to recognise an HTML tag in the character range [begin, end).
// flags == cAlone selects cOneHtmlTokenExpression, so the tag must span the
// whole range; any other value selects cStartHtmlTokenExpression, so the tag
// only has to begin at the start of the range.
// On a match the result records:
//   isClosingTag  - true when capture 2 (the optional slash) is non-empty
//   lengthOfToken - length of the full match
// NOTE(review): the no-match return is not visible in this excerpt;
// presumably an empty optional - confirm.
45 std::optional<HtmlTagInfo>
46 parseHtmlTag(std::string::const_iterator begin,
47 std::string::const_iterator end,
48 ParseHtmlTagFlags flags)
52 if (boost::regex_search(
56 (flags == cAlone ? cOneHtmlTokenExpression : cStartHtmlTokenExpression)))
66 r.isClosingTag = (m[2].length() > 0);
67 r.lengthOfToken = m[0].length();
// Walks src looking for HTML tags with cHtmlTokenExpression, starting each
// search at prev. The visible fragments check whether any text lies between
// prev and the start of the match, and copy whatever remains in [prev, end)
// into eol.
// NOTE(review): the return type, the enclosing loop and how matched tags are
// stored are not visible in this excerpt.
75 parseInlineHtmlText(
const std::string& src)
78 std::string::const_iterator prev = src.begin(), end = src.end();
84 if (boost::regex_search(prev, end, m, cHtmlTokenExpression))
86 if (prev != m[0].first)
103 eol = std::string(prev, end);
// Returns whether the range [begin, end) starts with the HTML comment opener.
// The pattern is anchored at the start and used with regex_search, so
// anything may follow the opener.
117 isHtmlCommentStart(std::string::const_iterator begin, std::string::const_iterator end)
121 static const boost::regex cExpression(
"^<!--");
122 return boost::regex_search(begin, end, cExpression);
// Returns whether the whole range [begin, end) ends with an HTML comment
// closer: two dashes, optional spaces, then a closing angle bracket.
// regex_match is used, so the pattern has to cover the entire range; the
// leading wildcard absorbs whatever precedes the closer.
126 isHtmlCommentEnd(std::string::const_iterator begin, std::string::const_iterator end)
128 static const boost::regex cExpression(
".*-- *>$");
129 return boost::regex_match(begin, end, cExpression);
// Returns whether line counts as blank: up to three leading spaces, any
// number of comment-like groups (each optionally followed by spaces) and
// trailing spaces, with nothing else on the line (regex_match must cover the
// whole string).
// NOTE(review): the group opener in the pattern is written as angle bracket
// plus two dashes, without the exclamation mark that isHtmlCommentStart looks
// for; confirm whether that is intended.
133 isBlankLine(
const std::string& line)
135 static const boost::regex cExpression(
" {0,3}(<--(.*)-- *> *)* *");
136 return boost::regex_match(line, cExpression);
// Fragment of the block-level inline-HTML parser. The declarator line is not
// visible in this excerpt; presumably this is parseInlineHtml, which
// Document::_processInlineHtmlAndReferences calls.
// The first line is classified as either starting with an HTML tag
// (parseHtmlTag with cStarts) or starting an HTML comment
// (isHtmlCommentStart). Two do/while loops, each running while i != end and
// done is false, then collect lines into contents.
// In both loops, when the current token is a blank line and the previous
// line has text, the previous line is examined (after a prevLine == firstLine
// check whose body is not visible): the tag loop asks whether it is a lone
// tag (parseHtmlTag with cAlone), the comment loop whether it ends a comment
// (isHtmlCommentEnd).
// NOTE(review): what happens when those checks succeed is not visible here;
// presumably done is set - confirm.
139 std::optional<TokenPtr>
145 const std::string line(*(*i)->text());
147 bool tag =
false, comment =
false;
148 std::optional<HtmlTagInfo> tagInfo = parseHtmlTag(line.begin(), line.end(), cStarts);
154 else if (isHtmlCommentStart(line.begin(), line.end()))
176 contents.splice(contents.end(), t);
180 contents.push_back(*i);
187 if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
189 if (prevLine == firstLine)
195 const std::string text(*(*prevLine)->text());
197 if (parseHtmlTag(text.begin(), text.end(), cAlone))
203 }
while (i != end && !done);
237 contents.push_back(*i);
243 if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
245 if (prevLine == firstLine)
251 const std::string text(*(*prevLine)->text());
253 if (isHtmlCommentEnd(text.begin(), text.end()))
259 }
while (i != end && !done);
// Fragment of isCodeBlockLine (name taken from the recursive call below; the
// declarator line is not visible in this excerpt).
// Blank line: the function calls itself and, on the visible return path,
// yields that result prefixed with a newline.
// Text line that can contain markup and has at least four characters: an
// iterator walks over the first four characters while they are spaces, and
// the visible return yields the rest of the line from that iterator on.
// NOTE(review): presumably the remainder is returned only when all four
// leading characters were spaces; that check is not visible here.
269 std::optional<std::string>
272 if ((*i)->isBlankLine())
279 std::optional<std::string> r = isCodeBlockLine(i, end);
283 return std::string(
"\n" + *r);
289 else if ((*i)->text() && (*i)->canContainMarkup())
291 std::string line(*(*i)->text());
293 if (line.length() >= 4)
295 std::string::iterator si = line.begin(), sie = si + 4;
297 while (si != sie && *si ==
' ')
305 return std::string(si, line.end());
// Fragment of the code-block parser (declarator not visible; presumably
// parseCodeBlock, which Document::_processBlocksItems calls).
// Only attempted when the current token is not a blank line. The text
// returned by isCodeBlockLine is written to an output string stream followed
// by a newline, and the same is done for further isCodeBlockLine results.
// NOTE(review): the loop structure and the token built from the accumulated
// text are not visible in this excerpt.
313 std::optional<TokenPtr>
316 if (!(*i)->isBlankLine())
318 std::optional<std::string> contents = isCodeBlockLine(i, end);
322 std::ostringstream out;
323 out << *contents <<
'\n';
327 contents = isCodeBlockLine(i, end);
331 out << *contents <<
'\n';
// Iterates over every character of prefixString. The block-quote parser
// passes in the quote-marker prefix captured by its regex and stores the
// result as a size_t quote level.
// NOTE(review): the loop body and return are not visible here; presumably it
// counts the quote-marker characters - confirm.
347 countQuoteLevel(
const std::string& prefixString)
351 for (
char qi : prefixString)
// Fragment of the block-quote parser (declarator not visible; presumably
// parseBlockQuote, which Document::_processBlocksItems calls).
// A candidate line must be non-blank, have text and be able to contain
// markup. cBlockQuoteExpression requires one or more quote markers, each
// preceded by up to three spaces (capture 1), then a single space and the
// quoted content (capture 2).
// The quote level is derived from capture 1 via countQuoteLevel, and
// continuationExpression is built at run time to require exactly that many
// markers, followed by an optional space and the rest of the line.
// Later lines are matched against continuationExpression, with separate
// handling when the current token is a blank line; capture 2 of a
// continuation is tested with isBlankLine.
360 std::optional<TokenPtr>
363 static const boost::regex cBlockQuoteExpression(
"^((?: {0,3}>)+) (.*)$");
366 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
368 const std::string line(*(*i)->text());
371 if (boost::regex_match(line, m, cBlockQuoteExpression))
373 size_t quoteLevel = countQuoteLevel(m[1]);
374 boost::regex continuationExpression =
375 boost::regex(
"^((?: {0,3}>){" +
std::to_string(quoteLevel) +
"}) ?(.*)$");
389 if ((*i)->isBlankLine())
401 const std::string line(*(*ii)->text());
403 if (boost::regex_match(line, m, continuationExpression))
405 if (m[1].matched && m[1].length() > 0)
425 const std::string line(*(*i)->text());
427 if (boost::regex_match(line, m, continuationExpression))
429 assert(m[2].matched);
431 if (!isBlankLine(m[2]))
// Fragment of parseListBlock (name taken from the recursive calls below; the
// declarator line is not visible in this excerpt).
//
// First line:
//   cUnorderedListExpression - leading spaces (capture 1), one bullet
//     character out of star, plus or dash (capture 2), at least one space,
//     then content that does not start with a star or dash (capture 3).
//   cOrderedListExpression - leading spaces (capture 1), digits (capture 2),
//     a dot, at least one space, then the content (capture 3).
//   In both cases indent is the length of capture 1 and the match is only
//   accepted when sub is set or indent is below 4.
//
// Per-list regexes built at run time from indent:
//   nextItemExpression - exactly indent spaces followed by the same bullet
//     character (unordered) or by digits and a dot (ordered).
//   startSublistExpression - indent spaces plus at least one more space,
//     then any bullet or number marker.
//   continuedAfterBlankLineExpression / codeBlockAfterBlankLineExpression -
//     lines starting with indent + 4 / indent + 8 spaces (the tails of these
//     patterns are not visible in this excerpt).
//
// NOTE(review): a local std::ostringstream named sub is declared further
// down while sub is also tested in the conditions above; presumably the
// earlier sub is a function parameter that the local shadows - confirm.
456 std::optional<TokenPtr>
459 static const boost::regex cUnorderedListExpression(
"^( *)([*+-]) +([^*-].*)$");
460 static const boost::regex cOrderedListExpression(
"^( *)([0-9]+)\\. +(.*)$");
469 ListType type = cNone;
471 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
473 boost::regex nextItemExpression, startSublistExpression;
476 const std::string line((*i)->text().value());
484 if (boost::regex_match(line, m, cUnorderedListExpression))
486 indent = m[1].length();
488 if (
sub || indent < 4)
491 char startChar = *m[2].first;
494 std::ostringstream next;
495 next <<
"^" << std::string(indent,
' ') <<
"\\" << startChar <<
" +([^*-].*)$";
496 nextItemExpression = next.str();
499 else if (boost::regex_match(line, m, cOrderedListExpression))
501 indent = m[1].length();
503 if (
sub || indent < 4)
508 std::ostringstream next;
509 next <<
"^" << std::string(indent,
' ') <<
"[0-9]+\\. +(.*)$";
510 nextItemExpression = next.str();
517 size_t itemCount = 1;
518 std::ostringstream
sub;
519 sub <<
"^" << std::string(indent,
' ') <<
" +(([*+-])|([0-9]+\\.)) +.*$";
520 startSublistExpression =
sub.str();
535 static const boost::regex cContinuedItemExpression(
"^ *([^ ].*)$");
537 boost::regex continuedAfterBlankLineExpression(
"^ {" +
std::to_string(indent + 4) +
539 boost::regex codeBlockAfterBlankLineExpression(
"^ {" +
std::to_string(indent + 8) +
549 NextItemType nextItem = cUnknown;
550 bool setParagraphMode =
false;
// Current token is a blank line: the token referenced by ii is classified, in
// this order, as a sublist start, the next item of this list, a continuation
// of the current item, or an indented code block. Several of these branches
// set setParagraphMode; branches also mark the end of the list.
556 if ((*i)->isBlankLine())
564 nextItem = cEndOfList;
566 else if ((*ii)->text())
568 const std::string line(*(*ii)->text());
570 if (boost::regex_match(line, startSublistExpression))
572 setParagraphMode =
true;
575 std::optional<TokenPtr> p = parseListBlock(i, end,
true);
577 subItemTokens.push_back(*p);
580 else if (boost::regex_match(line, m, nextItemExpression))
582 setParagraphMode =
true;
584 nextItem = cAnotherItem;
586 else if (boost::regex_match(line, m, continuedAfterBlankLineExpression))
588 assert(m[1].matched);
590 subItemTokens.push_back(
595 else if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression))
597 setParagraphMode =
true;
599 assert(m[1].matched);
// Code block inside a list item: capture 1 of each matching line is appended
// to codeBlock with a newline; a match found after a blank line gets an extra
// newline in front.
602 std::string codeBlock = m[1] +
'\n';
607 if ((*ii)->isBlankLine())
611 const std::string nextLine(*(*iii)->text());
613 if (boost::regex_match(
614 nextLine, m, codeBlockAfterBlankLineExpression))
616 codeBlock +=
'\n' + m[1] +
'\n';
624 else if ((*ii)->text())
626 const std::string line(*(*ii)->text());
628 if (boost::regex_match(
629 line, m, codeBlockAfterBlankLineExpression))
631 codeBlock += m[1] +
'\n';
646 subItemTokens.push_back(
653 nextItem = cEndOfList;
// Current token is a text line: it is a sublist start (parsed recursively),
// the next item of this list, a line matching either top-level list pattern
// (which ends this list), or otherwise a continuation of the current item
// matched by cContinuedItemExpression.
661 else if ((*i)->text())
663 const std::string line(*(*i)->text());
665 if (boost::regex_match(line, startSublistExpression))
668 std::optional<TokenPtr> p = parseListBlock(i, end,
true);
670 subItemTokens.push_back(*p);
673 else if (boost::regex_match(line, m, nextItemExpression))
675 nextItem = cAnotherItem;
679 if (boost::regex_match(line, m, cUnorderedListExpression) ||
680 boost::regex_match(line, m, cOrderedListExpression))
683 nextItem = cEndOfList;
687 boost::regex_match(line, m, cContinuedItemExpression);
688 assert(m[1].matched);
689 subItemTokens.push_back(
698 nextItem = cEndOfList;
// Pending sub-item tokens are consumed and cleared here and once more further
// down; every path must have decided nextItem by the assert.
701 if (!subItemTokens.empty())
704 subItemTokens.clear();
707 assert(nextItem != cUnknown);
709 if (nextItem == cAnotherItem)
722 if (!subItemTokens.empty())
725 subItemTokens.clear();
// The result is built only for lists with more than one item or a non-zero
// indent, and is chosen by list type.
728 if (itemCount > 1 || indent != 0)
730 if (type == cUnordered)
// Fragment of the link-reference-definition parser (declarator not visible;
// presumably parseReference, which Document::_processInlineHtmlAndReferences
// calls with the id table).
// cReference matches a whole line made of: up to three spaces, a bracketed
// id (capture 1), a colon, at least one space, a URL optionally wrapped in
// angle brackets (capture 2), and an optional title that is either quoted
// (capture 3 is the quote, capture 4 the text) or parenthesised (capture 5).
// When the next token has text it is tested against cSeparateTitle, a line
// holding only a quoted (capture 2) or parenthesised (capture 3) title; on a
// match, title is taken from whichever of those two captures matched.
// The reference is finally stored with idTable.add(id, url, title).
758 static const boost::regex cReference(
759 "^ {0,3}\\[(.+)\\]: +<?([^ >]+)>?(?: *(?:('|\")(.*)\\3)|(?:\\((.*)\\)))?$");
762 const std::string line1(*(*i)->text());
765 if (boost::regex_match(line1, m, cReference))
767 std::string id(m[1]), url(m[2]), title;
773 else if (m[5].matched)
782 if (ii != end && (*ii)->text())
785 static const boost::regex cSeparateTitle(
786 "^ *(?:(?:('|\")(.*)\\1)|(?:\\((.*)\\))) *$");
789 const std::string line2(*(*ii)->text());
791 if (boost::regex_match(line2, m, cSeparateTitle))
794 title = (m[2].matched ? m[2] : m[3]);
799 idTable.
add(
id, url, title);
// Empties the paragraph accumulators used by
// Document::_processParagraphLines.
// If paragraphText is non-empty it is consumed and cleared. If
// paragraphTokens is non-empty, one entry is appended to finalTokens: with
// more than one token a wrapping expression whose text is not visible here,
// otherwise the single token itself. paragraphTokens is then cleared.
// NOTE(review): the remaining parameters and what is built from
// paragraphText are not visible in this excerpt; callers pass four arguments,
// the last being a no-paragraph flag.
808 flushParagraph(std::string& paragraphText,
813 if (!paragraphText.empty())
816 paragraphText.clear();
819 if (!paragraphTokens.empty())
823 if (paragraphTokens.size() > 1)
825 finalTokens.push_back(
830 finalTokens.push_back(*paragraphTokens.begin());
838 paragraphTokens.clear();
// Fragment of the header parser (declarator not visible; presumably
// parseHeader, which Document::_processBlocksItems calls).
// The current line must be non-blank, have text and be able to contain
// markup. Two header forms are recognised:
//   cHashHeaders - one to six hash characters (capture 1), at least one
//     space, the title (capture 2, lazy), then optional trailing spaces and
//     hash characters.
//   cUnderlinedHeaders - applied to the FOLLOWING line (ii) when it is also
//     an eligible text line: the line must consist solely of repeated dashes
//     or solely of repeated equals signs; typeChar is that character.
// NOTE(review): how the header level is derived is not visible here.
842 std::optional<TokenPtr>
845 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
848 static const boost::regex cHashHeaders(
"^(#{1,6}) +(.*?) *#*$");
849 const std::string line = *(*i)->text();
852 if (boost::regex_match(line, m, cHashHeaders))
861 if (ii != end && !(*ii)->isBlankLine() && (*ii)->text() && (*ii)->canContainMarkup())
863 static const boost::regex cUnderlinedHeaders(
"^([-=])\\1*$");
864 const std::string line = *(*ii)->text();
866 if (boost::regex_match(line, m, cUnderlinedHeaders))
868 char typeChar = std::string(m[1])[0];
// Fragment of the horizontal-rule parser (declarator not visible; presumably
// parseHorizontalRule, which Document::_processBlocksItems calls).
// An eligible line (non-blank, has text, can contain markup) is a rule when
// it consists of up to three leading spaces followed by three or more rule
// characters (dash, star or underscore), each optionally followed by spaces.
// As written, the pattern does not require the rule characters to be the
// same.
880 std::optional<TokenPtr>
883 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
885 static const boost::regex cHorizontalRules(
"^ {0,3}((?:-|\\*|_) *){3,}$");
886 const std::string line = *(*i)->text();
888 if (boost::regex_match(line, cHorizontalRules))
// Fragment of a LinkIds lookup (declarator not visible in this excerpt).
// The id is normalised with _scrubKey before searching mTable; the visible
// branch handles the case where an entry was found.
// NOTE(review): presumably an empty optional is returned when the key is
// absent - not visible here.
902 std::optional<LinkIds::Target>
905 Table::const_iterator i = mTable.find(_scrubKey(
id));
907 if (i != mTable.end())
// Registers a link reference: inserts a Target built from url and title into
// mTable under the key _scrubKey(id). The return value of insert is not
// examined on the visible line.
// NOTE(review): whether an existing entry for the same key is kept or
// replaced depends on the type of mTable, which is not visible here.
918 LinkIds::add(
const std::string&
id,
const std::string& url,
const std::string& title)
920 mTable.insert(std::make_pair(_scrubKey(
id),
Target(url, title)));
// Produces the table key for a link id; used by the LinkIds lookup and by
// LinkIds::add so both sides agree on the key form. str is taken by value.
// NOTE(review): the body is not visible in this excerpt, so the exact
// normalisation is unknown.
924 LinkIds::_scrubKey(std::string
str)
// Tab widths, in spaces. cSpacesPerInitialTab is what Document::_getline uses
// while it is still in a line's leading whitespace; cDefaultSpacesPerTab is
// defined as the same value.
// NOTE(review): presumably cDefaultSpacesPerTab is the default for the
// spacesPerTab constructor argument - confirm in the header.
929 const size_t Document::cSpacesPerInitialTab = 4;
930 const size_t Document::cDefaultSpacesPerTab = cSpacesPerInitialTab;
// Member-initializer fragments, presumably from two Document constructors
// (their declarator lines are not visible). Both store spacesPerTab in
// cSpacesPerTab and give mTokenContainer a freshly allocated
// token::Container.
933 cSpacesPerTab(spacesPerTab),
934 mTokenContainer(new token::Container),
942 cSpacesPerTab(spacesPerTab),
943 mTokenContainer(new token::Container),
// Wraps the string src in an input string stream.
// NOTE(review): the enclosing function is not visible; presumably a
// string-taking read overload that forwards to the stream-based one.
958 std::istringstream in(src);
// Reads one line from in into line.
// Visible behaviour:
//   - initialWhitespace starts true and is switched to false further down.
//   - In two places one more character is read and compared against a
//     line-ending character (newline in the first, carriage return in the
//     second); presumably this makes a two-character line ending count as
//     one terminator - the surrounding branches are not visible.
//   - The tab width is cSpacesPerInitialTab while initialWhitespace is true
//     and cSpacesPerTab afterwards.
//   - The final return is true only when line is non-empty.
// NOTE(review): the main read loop and any earlier return statements are not
// visible in this excerpt.
963 Document::_getline(std::istream& in, std::string& line)
969 bool initialWhitespace =
true;
976 if ((in.get(
c)) &&
c !=
'\n')
985 if ((in.get(
c)) &&
c !=
'\r')
994 size_t convert = (initialWhitespace ? cSpacesPerInitialTab : cSpacesPerTab);
1003 initialWhitespace =
false;
1008 return !line.empty();
// Fragment of the stream reader (declarator not visible). Lines are pulled
// with _getline until it returns false, and each is classified with the
// free function isBlankLine.
// NOTE(review): the tokens created for blank and non-blank lines are not
// visible here.
1020 assert(tokens != 0);
1025 while (_getline(in, line))
1027 if (isBlankLine(line))
// Output fragments (the enclosing functions are not visible): the first
// emits the token tree as HTML to out, the second calls writeToken on the
// root container with 0 as the first argument.
// NOTE(review): presumably the 0 is a starting indent or depth - confirm.
1046 mTokenContainer->writeAsHtml(out);
1053 mTokenContainer->writeToken(0, out);
// Runs the processing passes over the token tree, in this order:
//   1. merge HTML tags that were split across two lines,
//   2. extract inline HTML blocks and link reference definitions,
//   3. recognise block items,
//   4. group the remaining text lines into paragraphs,
//   5. process span-level elements, resolving links through mIdTable.
// NOTE(review): any guard around these calls is not visible in this excerpt.
1057 Document::_process()
1061 _mergeMultilineHtmlTags();
1062 _processInlineHtmlAndReferences();
1063 _processBlocksItems(mTokenContainer);
1064 _processParagraphLines(mTokenContainer);
1065 mTokenContainer->processSpanElements(*mIdTable);
// Handles HTML tags whose text is split over two consecutive lines.
// cHtmlTokenStart is the tag pattern without its closing angle bracket and
// anchored at the end of the line; cHtmlTokenEnd matches optional leading
// spaces, optional attributes, an optional slash and the closing angle
// bracket. Both are applied with regex_match, so each has to cover its whole
// line.
// For every sub-token of the root container: if it has text matching
// cHtmlTokenStart and the following token (i2) has text matching
// cHtmlTokenEnd, the pair is handled specially; the visible fallback pushes
// the token into processed unchanged. processed then replaces the
// container's sub-tokens.
// NOTE(review): the merge action itself is not visible in this excerpt.
1071 Document::_mergeMultilineHtmlTags()
1073 static const boost::regex cHtmlTokenStart(
1074 "<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))$");
1075 static const boost::regex cHtmlTokenEnd(
1076 "^ *((?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\3))*? */? *))>");
1080 token::Container* tokens =
dynamic_cast<token::Container*
>(mTokenContainer.get());
1081 assert(tokens != 0);
1083 for (TokenGroup::const_iterator i = tokens->subTokens().begin(),
1084 ie = tokens->subTokens().end();
1088 if ((*i)->text() && boost::regex_match(*(*i)->text(), cHtmlTokenStart))
1090 TokenGroup::const_iterator i2 = i;
1093 if (i2 != tokens->subTokens().end() && (*i2)->text() &&
1094 boost::regex_match(*(*i2)->text(), cHtmlTokenEnd))
1103 processed.push_back(*i);
1106 tokens->swapSubtokens(processed);
// Walks the root container's sub-tokens and pulls out block-level inline
// HTML and link reference definitions.
// parseInlineHtml is attempted only when processed is empty or its last
// entry is a blank line; a successful result is appended to processed.
// parseReference is given the shared id table (*mIdTable). The visible
// fallback copies the token to processed unchanged. processed finally
// replaces the container's sub-tokens.
// NOTE(review): what happens to a token after a successful parseReference is
// not visible in this excerpt.
1110 Document::_processInlineHtmlAndReferences()
1114 token::Container* tokens =
dynamic_cast<token::Container*
>(mTokenContainer.get());
1115 assert(tokens != 0);
1117 for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1118 iie = tokens->subTokens().end();
1124 if (processed.empty() || processed.back()->isBlankLine())
1126 std::optional<TokenPtr> inlineHtml = parseInlineHtml(ii, iie);
1130 processed.push_back(*inlineHtml);
1141 if (parseReference(ii, iie, *mIdTable))
1157 processed.push_back(*ii);
1160 tokens->swapSubtokens(processed);
// Recognises block-level items inside inTokenContainer, recursively.
// There is a separate branch for tokens that are not containers at the top.
// For each sub-token the block parsers are assigned to subitem in this
// source order: parseHeader, parseHorizontalRule, parseListBlock,
// parseBlockQuote, parseCodeBlock. A recognised subitem is itself processed
// recursively and appended to processed. Other visible paths append the
// token unchanged, and sub-tokens that are containers are processed
// recursively before being appended. processed finally replaces the
// container's sub-tokens.
// NOTE(review): the conditions guarding each parser call are not visible in
// this excerpt; presumably each runs only if the previous ones found nothing.
1164 Document::_processBlocksItems(
TokenPtr inTokenContainer)
1166 if (!inTokenContainer->isContainer())
1171 token::Container* tokens =
dynamic_cast<token::Container*
>(inTokenContainer.get());
1172 assert(tokens != 0);
1176 for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1177 iie = tokens->subTokens().end();
1183 std::optional<TokenPtr> subitem;
1187 subitem = parseHeader(ii, iie);
1192 subitem = parseHorizontalRule(ii, iie);
1197 subitem = parseListBlock(ii, iie);
1202 subitem = parseBlockQuote(ii, iie);
1207 subitem = parseCodeBlock(ii, iie);
1212 _processBlocksItems(*subitem);
1213 processed.push_back(*subitem);
1224 processed.push_back(*ii);
1227 else if ((*ii)->isContainer())
1229 _processBlocksItems(*ii);
1230 processed.push_back(*ii);
1234 tokens->swapSubtokens(processed);
// Groups consecutive text lines of inTokenContainer into paragraphs.
// noPara is the container's inhibitParagraphs() value and is forwarded to
// every flushParagraph call.
// Pass 1: every sub-token that is a container is processed recursively.
// Pass 2: sub-tokens that have text, can contain markup and do not inhibit
// paragraphs are accumulated into paragraphText, with a single space
// inserted between successive lines. A line matching cExpression (text
// ending in a space) contributes capture 1 and then flushes the paragraph
// immediately; other lines are appended whole. On the other visible path the
// pending paragraph is flushed and the token is passed through to processed.
// A final flush handles text still pending after the loop, and processed
// replaces the container's sub-tokens.
1238 Document::_processParagraphLines(
TokenPtr inTokenContainer)
1240 token::Container* tokens =
dynamic_cast<token::Container*
>(inTokenContainer.get());
1241 assert(tokens != 0);
1243 bool noPara = tokens->inhibitParagraphs();
1245 for (
const auto& ii : tokens->subTokens())
1246 if (ii->isContainer())
1248 _processParagraphLines(ii);
1252 std::string paragraphText;
1255 for (
const auto& ii : tokens->subTokens())
1257 if (ii->text() && ii->canContainMarkup() && !ii->inhibitParagraphs())
1259 static const boost::regex cExpression(
"^(.*) $");
1261 if (!paragraphText.empty())
1263 paragraphText +=
" ";
1268 if (boost::regex_match(*ii->text(), m, cExpression))
1270 paragraphText += m[1];
1271 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1276 paragraphText += *ii->text();
1281 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1282 processed.push_back(ii);
1287 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1289 tokens->swapSubtokens(processed);