11 #include <SimoxUtility/algorithm/string/string_tools.h>
13 #include <boost/regex.hpp>
// Regex source for one HTML tag: "<", an optional "/" (capture 2), an
// alphanumeric tag name (capture 3), zero or more attributes (each a run of
// spaces plus an alphanumeric name, optionally followed by "=" and a value
// delimited by matching single or double quotes via the \5 back-reference),
// an optional "/" and finally ">". Capture 1 spans everything between the
// angle brackets.
32 const std::string cHtmlTokenSource(
"<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))>");
// Three compiled forms of that pattern: as-is (unanchored), with a "^"
// prefix, and with both a "^" prefix and a "$" suffix.
33 const boost::regex cHtmlTokenExpression(cHtmlTokenSource),
34 cStartHtmlTokenExpression(
"^" + cHtmlTokenSource),
35 cOneHtmlTokenExpression(
"^" + cHtmlTokenSource +
"$");
// Mode selector for parseHtmlTag(): cAlone picks cOneHtmlTokenExpression,
// any other value (cStarts) picks cStartHtmlTokenExpression.
37 enum ParseHtmlTagFlags { cAlone, cStarts };
// Tries to recognise an HTML tag in the character range [begin, end).
//
// begin, end: iterators delimiting the text to examine.
// flags:      cAlone searches with the "^...$" expression (the tag must span
//             the text); otherwise the "^..." expression is used (the tag
//             only has to start the text).
//
// On a successful search the result object is filled in from the match.
39 std::optional<HtmlTagInfo> parseHtmlTag(std::string::const_iterator begin,
40 std::string::const_iterator end, ParseHtmlTagFlags flags)
44 if (boost::regex_search(begin, end, m, (flags == cAlone ?
45 cOneHtmlTokenExpression : cStartHtmlTokenExpression)))
// Capture 2 is the optional "/" right after "<"; non-empty means "</name>".
55 r.isClosingTag = (m[2].length() > 0);
// Length of the whole matched tag, angle brackets included.
56 r.lengthOfToken = m[0].length();
66 std::string::const_iterator prev = src.begin(), end = src.end();
72 if (boost::regex_search(prev, end, m, cHtmlTokenExpression))
74 if (prev != m[0].first)
90 eol = std::string(prev, end);
// Reports whether the text in [begin, end) opens an HTML comment, i.e.
// whether the "^"-anchored pattern "<!--" can be found in it.
103 bool isHtmlCommentStart(std::string::const_iterator begin,
104 std::string::const_iterator end)
// Compiled once and reused on every call.
108 static const boost::regex cExpression(
"^<!--");
109 return boost::regex_search(begin, end, cExpression);
// Reports whether the text in [begin, end) finishes an HTML comment: the
// whole range must match "anything, then --, optional spaces, then >".
112 bool isHtmlCommentEnd(std::string::const_iterator begin,
113 std::string::const_iterator end)
// Compiled once and reused on every call.
115 static const boost::regex cExpression(
".*-- *>$");
// regex_match requires the entire range to match, not just a part of it.
116 return boost::regex_match(begin, end, cExpression);
// Reports whether `line` counts as blank. The whole line must match: up to
// three leading spaces, zero or more "<-- ... -->" groups (each optionally
// followed by spaces), then any number of trailing spaces. An empty line and
// a line made only of spaces therefore both match.
//
// NOTE(review): the group opens with "<--", whereas isHtmlCommentStart()
// looks for "<!--". As written, a regular HTML comment line is not treated
// as blank. Confirm whether the missing "!" is intentional.
119 bool isBlankLine(
const std::string& line)
// Compiled once and reused on every call.
121 static const boost::regex cExpression(
" {0,3}(<--(.*)-- *> *)* *");
122 return boost::regex_match(line, cExpression);
130 const std::string line(*(*i)->text());
132 bool tag =
false, comment =
false;
133 std::optional<HtmlTagInfo> tagInfo = parseHtmlTag(line.begin(), line.end(), cStarts);
139 else if (isHtmlCommentStart(line.begin(), line.end()))
161 contents.splice(contents.end(), t);
165 contents.push_back(*i);
172 if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
174 if (prevLine == firstLine)
180 const std::string text(*(*prevLine)->text());
182 if (parseHtmlTag(text.begin(), text.end(), cAlone))
189 while (i != end && !done);
222 contents.push_back(*i);
228 if (i != end && (*i)->isBlankLine() && (*prevLine)->text())
230 if (prevLine == firstLine)
236 const std::string text(*(*prevLine)->text());
238 if (isHtmlCommentEnd(text.begin(), text.end()))
245 while (i != end && !done);
257 if ((*i)->isBlankLine())
264 std::optional<std::string> r = isCodeBlockLine(i, end);
268 return std::string(
"\n" + *r);
274 else if ((*i)->text() && (*i)->canContainMarkup())
276 std::string line(*(*i)->text());
278 if (line.length() >= 4)
280 std::string::iterator si = line.begin(), sie = si + 4;
282 while (si != sie && *si ==
' ')
290 return std::string(si, line.end());
300 if (!(*i)->isBlankLine())
302 std::optional<std::string> contents = isCodeBlockLine(i, end);
306 std::ostringstream out;
307 out << *contents <<
'\n';
311 contents = isCodeBlockLine(i, end);
315 out << *contents <<
'\n';
// Derives a quote level from `prefixString` by visiting each of its
// characters in turn.
// NOTE(review): presumably this counts the ">" markers in a block-quote
// prefix; confirm against the loop body.
332 size_t countQuoteLevel(
const std::string& prefixString)
336 for (
char qi : prefixString)
347 static const boost::regex cBlockQuoteExpression(
"^((?: {0,3}>)+) (.*)$");
350 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
352 const std::string line(*(*i)->text());
355 if (boost::regex_match(line, m, cBlockQuoteExpression))
357 size_t quoteLevel = countQuoteLevel(m[1]);
358 boost::regex continuationExpression = boost::regex(
"^((?: {0,3}>){" +
std::to_string(quoteLevel) +
"}) ?(.*)$");
372 if ((*i)->isBlankLine())
384 const std::string line(*(*ii)->text());
386 if (boost::regex_match(line, m, continuationExpression))
388 if (m[1].matched && m[1].length() > 0)
407 const std::string line(*(*i)->text());
409 if (boost::regex_match(line, m, continuationExpression))
411 assert(m[2].matched);
413 if (!isBlankLine(m[2]))
440 static const boost::regex cUnorderedListExpression(
"^( *)([*+-]) +([^*-].*)$");
441 static const boost::regex cOrderedListExpression(
"^( *)([0-9]+)\\. +(.*)$");
443 enum ListType { cNone, cUnordered, cOrdered };
444 ListType type = cNone;
446 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
448 boost::regex nextItemExpression, startSublistExpression;
451 const std::string line((*i)->text().value());
459 if (boost::regex_match(line, m, cUnorderedListExpression))
461 indent = m[1].length();
463 if (
sub || indent < 4)
466 char startChar = *m[2].first;
469 std::ostringstream next;
470 next <<
"^" << std::string(indent,
' ') <<
"\\" << startChar <<
" +([^*-].*)$";
471 nextItemExpression = next.str();
474 else if (boost::regex_match(line, m, cOrderedListExpression))
476 indent = m[1].length();
478 if (
sub || indent < 4)
483 std::ostringstream next;
484 next <<
"^" << std::string(indent,
' ') <<
"[0-9]+\\. +(.*)$";
485 nextItemExpression = next.str();
492 size_t itemCount = 1;
493 std::ostringstream
sub;
494 sub <<
"^" << std::string(indent,
' ') <<
" +(([*+-])|([0-9]+\\.)) +.*$";
495 startSublistExpression =
sub.str();
510 static const boost::regex cContinuedItemExpression(
"^ *([^ ].*)$");
512 boost::regex continuedAfterBlankLineExpression(
"^ {" +
514 boost::regex codeBlockAfterBlankLineExpression(
"^ {" +
517 enum NextItemType { cUnknown, cEndOfList, cAnotherItem };
518 NextItemType nextItem = cUnknown;
519 bool setParagraphMode =
false;
525 if ((*i)->isBlankLine())
533 nextItem = cEndOfList;
535 else if ((*ii)->text())
537 const std::string line(*(*ii)->text());
539 if (boost::regex_match(line, startSublistExpression))
541 setParagraphMode =
true;
544 std::optional<TokenPtr> p = parseListBlock(i, end,
true);
546 subItemTokens.push_back(*p);
549 else if (boost::regex_match(line, m, nextItemExpression))
551 setParagraphMode =
true;
553 nextItem = cAnotherItem;
555 else if (boost::regex_match(line, m, continuedAfterBlankLineExpression))
557 assert(m[1].matched);
563 else if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression))
565 setParagraphMode =
true;
567 assert(m[1].matched);
570 std::string codeBlock = m[1] +
'\n';
575 if ((*ii)->isBlankLine())
579 const std::string nextLine(*(*iii)->text());
581 if (boost::regex_match(nextLine, m, codeBlockAfterBlankLineExpression))
583 codeBlock +=
'\n' + m[1] +
'\n';
591 else if ((*ii)->text())
593 const std::string line(*(*ii)->text());
595 if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression))
597 codeBlock += m[1] +
'\n';
618 nextItem = cEndOfList;
626 else if ((*i)->text())
628 const std::string line(*(*i)->text());
630 if (boost::regex_match(line, startSublistExpression))
633 std::optional<TokenPtr> p = parseListBlock(i, end,
true);
635 subItemTokens.push_back(*p);
638 else if (boost::regex_match(line, m, nextItemExpression))
640 nextItem = cAnotherItem;
644 if (boost::regex_match(line, m, cUnorderedListExpression)
645 || boost::regex_match(line, m, cOrderedListExpression))
648 nextItem = cEndOfList;
652 boost::regex_match(line, m, cContinuedItemExpression);
653 assert(m[1].matched);
662 nextItem = cEndOfList;
665 if (!subItemTokens.empty())
668 subItemTokens.clear();
671 assert(nextItem != cUnknown);
673 if (nextItem == cAnotherItem)
686 if (!subItemTokens.empty())
689 subItemTokens.clear();
692 if (itemCount > 1 || indent != 0)
694 if (type == cUnordered)
719 static const boost::regex cReference(
"^ {0,3}\\[(.+)\\]: +<?([^ >]+)>?(?: *(?:('|\")(.*)\\3)|(?:\\((.*)\\)))?$");
722 const std::string line1(*(*i)->text());
725 if (boost::regex_match(line1, m, cReference))
727 std::string id(m[1]), url(m[2]), title;
733 else if (m[5].matched)
742 if (ii != end && (*ii)->text())
745 static const boost::regex cSeparateTitle(
"^ *(?:(?:('|\")(.*)\\1)|(?:\\((.*)\\))) *$");
748 const std::string line2(*(*ii)->text());
750 if (boost::regex_match(line2, m, cSeparateTitle))
753 title = (m[2].matched ? m[2] : m[3]);
758 idTable.
add(
id, url, title);
769 if (!paragraphText.empty())
772 paragraphText.clear();
775 if (!paragraphTokens.empty())
779 if (paragraphTokens.size() > 1)
785 finalTokens.push_back(*paragraphTokens.begin());
793 paragraphTokens.clear();
799 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
802 static const boost::regex cHashHeaders(
"^(#{1,6}) +(.*?) *#*$");
803 const std::string line = *(*i)->text();
806 if (boost::regex_match(line, m, cHashHeaders))
815 if (ii != end && !(*ii)->isBlankLine() && (*ii)->text() && (*ii)->canContainMarkup())
817 static const boost::regex cUnderlinedHeaders(
"^([-=])\\1*$");
818 const std::string line = *(*ii)->text();
820 if (boost::regex_match(line, m, cUnderlinedHeaders))
822 char typeChar = std::string(m[1])[0];
824 ? 1 : 2), *(*i)->text()));
836 if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup())
838 static const boost::regex cHorizontalRules(
"^ {0,3}((?:-|\\*|_) *){3,}$");
839 const std::string line = *(*i)->text();
841 if (boost::regex_match(line, cHorizontalRules))
859 Table::const_iterator i = mTable.find(_scrubKey(
id));
861 if (i != mTable.end())
874 mTable.insert(std::make_pair(_scrubKey(
id),
Target(url, title)));
// Normalises a link id before it is used as a key in mTable: returns the
// result of simox::alg::to_lower() applied to `str`. Both the lookup and the
// insertion into mTable pass their id through this function, so the two
// sides always agree on the key form.
// `str` is taken by value, so the caller's string is left untouched.
877 std::string LinkIds::_scrubKey(std::string
str)
879 return simox::alg::to_lower(
str);
// Tab width that _getline() selects for its `convert` value while it is
// still in a line's initial whitespace.
884 const size_t Document::cSpacesPerInitialTab = 4;
// Default tab width; defined to be the same value as cSpacesPerInitialTab.
885 const size_t Document::cDefaultSpacesPerTab = cSpacesPerInitialTab;
888 mTokenContainer(new token::Container), mIdTable(new
LinkIds),
895 cSpacesPerTab(spacesPerTab), mTokenContainer(new token::Container),
896 mIdTable(new
LinkIds), mProcessed(false)
908 std::istringstream in(src);
// Reads characters from `in` to build `line`.
//
// in:   stream to read from.
// line: output string.
//
// The final return statement yields true when `line` ended up non-empty.
912 bool Document::_getline(std::istream& in, std::string& line)
// True until the flag is cleared below; selects the tab width.
918 bool initialWhitespace =
true;
// Looks at the following character and tests that it is not '\n'.
// NOTE(review): presumably this is how a two-character line ending is
// consumed as a single break; confirm against the surrounding branch.
925 if ((in.get(
c)) &&
c !=
'\n')
// Mirror of the check above, testing the following character against '\r'.
934 if ((in.get(
c)) &&
c !=
'\r')
// Tab width: cSpacesPerInitialTab while still in initial whitespace,
// otherwise the alternative on the continuation of this expression.
943 size_t convert = (initialWhitespace ? cSpacesPerInitialTab :
953 initialWhitespace =
false;
958 return !line.empty();
974 while (_getline(in, line))
976 if (isBlankLine(line))
994 mTokenContainer->writeAsHtml(out);
1000 mTokenContainer->writeToken(0, out);
// Runs the processing passes over the document's token container, in this
// order: merge HTML tags split across lines, handle inline HTML and
// reference definitions, build block-level items, assemble paragraphs, and
// finally process span elements using the link-id table.
1003 void Document::_process()
1007 _mergeMultilineHtmlTags();
1008 _processInlineHtmlAndReferences();
1009 _processBlocksItems(mTokenContainer);
1010 _processParagraphLines(mTokenContainer);
// Span processing receives the id table so links can be looked up.
1011 mTokenContainer->processSpanElements(*mIdTable);
// Scans the top-level tokens for an HTML tag that is opened on one line and
// closed on the next, then replaces the container's sub-tokens with the
// processed list.
1016 void Document::_mergeMultilineHtmlTags()
// A tag that starts like cHtmlTokenSource but reaches end of line ("$")
// before its closing ">".
1018 static const boost::regex cHtmlTokenStart(
"<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))$");
// The remainder of such a tag: optional spaces, optional attributes, an
// optional "/" and the closing ">".
1019 static const boost::regex cHtmlTokenEnd(
"^ *((?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\3))*? */? *))>");
// The document's root token is expected to be a token::Container.
// The assert is the only check, so it vanishes when NDEBUG is defined.
1023 token::Container* tokens =
dynamic_cast<token::Container*
>(mTokenContainer.get());
1024 assert(tokens != 0);
1026 for (TokenGroup::const_iterator i = tokens->subTokens().begin(),
1027 ie = tokens->subTokens().end(); i != ie; ++i)
// regex_match: the token's entire text must be the unterminated tag start.
1029 if ((*i)->text() && boost::regex_match(*(*i)->text(), cHtmlTokenStart))
// Second iterator used to inspect a neighbouring token.
1031 TokenGroup::const_iterator i2 = i;
// The neighbouring token must exist, carry text, and consist entirely of
// the tag's remainder.
1034 if (i2 != tokens->subTokens().end() && (*i2)->text() &&
1035 boost::regex_match(*(*i2)->text(), cHtmlTokenEnd))
// The current token is carried over unchanged.
1043 processed.push_back(*i);
// Install the rebuilt token list in place of the old one.
1046 tokens->swapSubtokens(processed);
// Walks the top-level tokens, tries to recognise inline HTML blocks and
// link reference definitions, and replaces the container's sub-tokens with
// the processed list.
1049 void Document::_processInlineHtmlAndReferences()
// The document's root token is expected to be a token::Container.
// The assert is the only check, so it vanishes when NDEBUG is defined.
1053 token::Container* tokens =
dynamic_cast<token::Container*
>(mTokenContainer.get());
1054 assert(tokens != 0);
1056 for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1057 iie = tokens->subTokens().end(); ii != iie; ++ii)
// Inline HTML is only attempted at the very start of the output or right
// after a blank line.
1061 if (processed.empty() || processed.back()->isBlankLine())
1063 std::optional<TokenPtr> inlineHtml = parseInlineHtml(ii, iie);
1067 processed.push_back(*inlineHtml);
// parseReference receives the id table, so a recognised definition can be
// recorded there.
1078 if (parseReference(ii, iie, *mIdTable))
// The current token is carried over unchanged.
1094 processed.push_back(*ii);
// Install the rebuilt token list in place of the old one.
1097 tokens->swapSubtokens(processed);
// Converts runs of tokens inside `inTokenContainer` into block-level items
// and recurses into every container it produces or encounters. The
// container's sub-tokens are replaced with the processed list at the end.
//
// inTokenContainer: token to process; it is tested with isContainer()
//                   before being cast to token::Container.
1100 void Document::_processBlocksItems(
TokenPtr inTokenContainer)
1102 if (!inTokenContainer->isContainer())
1107 token::Container* tokens =
dynamic_cast<token::Container*
>(inTokenContainer.get());
1108 assert(tokens != 0);
1112 for (TokenGroup::const_iterator ii = tokens->subTokens().begin(),
1113 iie = tokens->subTokens().end(); ii != iie; ++ii)
// Result of whichever block parser is applied to the current position.
1117 std::optional<TokenPtr> subitem;
// The block parsers appear in this order: header, horizontal rule, list,
// block quote, code block. Each one receives the iterator pair (ii, iie).
1121 subitem = parseHeader(ii, iie);
1126 subitem = parseHorizontalRule(ii, iie);
1131 subitem = parseListBlock(ii, iie);
1136 subitem = parseBlockQuote(ii, iie);
1141 subitem = parseCodeBlock(ii, iie);
// A recognised block is processed recursively before it is stored.
1146 _processBlocksItems(*subitem);
1147 processed.push_back(*subitem);
// The current token is carried over unchanged.
1158 processed.push_back(*ii);
// Tokens that are already containers are recursed into and kept.
1161 else if ((*ii)->isContainer())
1163 _processBlocksItems(*ii);
1164 processed.push_back(*ii);
// Install the rebuilt token list in place of the old one.
1168 tokens->swapSubtokens(processed);
// Gathers consecutive text tokens of `inTokenContainer` into paragraphs,
// after first doing the same for every nested container. The container's
// sub-tokens are replaced with the processed list at the end.
//
// inTokenContainer: token expected to be a token::Container.
1171 void Document::_processParagraphLines(
TokenPtr inTokenContainer)
// NOTE(review): unlike _processBlocksItems(), no isContainer() test is
// visible before this cast; the assert is the only check shown here and it
// vanishes when NDEBUG is defined.
1173 token::Container* tokens =
dynamic_cast<token::Container*
>(inTokenContainer.get());
1174 assert(tokens != 0);
// Forwarded to every flushParagraph() call below.
1176 bool noPara = tokens->inhibitParagraphs();
// First pass: recurse into child containers.
1178 for (
const auto& ii : tokens->subTokens())
1179 if (ii->isContainer())
1181 _processParagraphLines(ii);
// Text accumulated for the paragraph currently being built.
1185 std::string paragraphText;
// Second pass: accumulate text lines and flush them as paragraphs.
1188 for (
const auto& ii : tokens->subTokens())
// Only tokens that carry text, may contain markup and do not inhibit
// paragraphs contribute to paragraph text.
1190 if (ii->text() && ii->canContainMarkup() && !ii->inhibitParagraphs())
// Matches text that ends in trailing space; capture 1 is the text before it.
1192 static const boost::regex cExpression(
"^(.*) $");
// Lines within one paragraph are joined with a single space.
1194 if (!paragraphText.empty())
1196 paragraphText +=
" ";
// On a match, only capture 1 is appended and the paragraph is flushed
// straight away.
1201 if (boost::regex_match(*ii->text(), m, cExpression))
1203 paragraphText += m[1];
1204 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
// Otherwise the token's text is appended as-is.
1209 paragraphText += *ii->text();
// Any other token flushes the pending paragraph and is then kept itself.
1214 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
1215 processed.push_back(ii);
// Flush whatever is still pending after the last token.
1220 flushParagraph(paragraphText, paragraphTokens, processed, noPara);
// Install the rebuilt token list in place of the old one.
1222 tokens->swapSubtokens(processed);