LCOV - code coverage report
Current view: top level - src/utilities - HTMLSanitizer.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 95.8 % 285 273
Test Date: 2026-04-19 00:35:54 Functions: 100.0 % 12 12

            Line data    Source code
       1              : #include "HTMLSanitizer.h"
       2              : #include "FangLogging.h"
       3              : 
       4              : #include <QXmlStreamReader>
       5              : #include <QXmlStreamWriter>
       6              : #include <QStack>
       7              : 
       8              : #include "QImageCache.h"
       9              : #include "WebUtilities.h"
      10              : #include "QTidyLibClassic.h"
      11              : 
      12              : // Strings.
      13              : #define S_WIDTH "width"
      14              : #define S_HEIGHT "height"
      15              : #define S_SRC "src"
      16              : #define S_IMG "img"
      17              : #define S_HREF "href"
      18              : #define S_ID "id"
      19              : 
      20              : namespace {
      21              : 
      22              : /*!
      23              :     \brief Represents a DOM node during HTML parsing.
      24              :  */
      25              : class DOMNode {
      26              : public:
      27          259 :     DOMNode(QString tagName, int intID) :
      28          259 :         tagName(tagName),
      29          259 :         intID(intID),
      30          259 :         nonEmptyTextCount(0),
      31          259 :         numChildren(0)
      32          259 :     {}
      33              : 
      34              :     // Stack requires a default c'tor
      35              :     DOMNode() :
      36              :         intID(0),
      37              :         nonEmptyTextCount(0),
      38              :         numChildren(0)
      39              :     {}
      40              : 
      41              :     QString tagName;
      42              :     int intID;
      43              :     int nonEmptyTextCount;
      44              :     int numChildren;
      45              : };
      46              : 
      47              : } // anonymous namespace
      48              : 
      49           41 : HTMLSanitizer::HTMLSanitizer(QObject *parent) :
      50              :     QObject(parent),
      51           41 :     currentId(0)
      52              : {
      53           41 :     tagsToRemove << "script"    // Javascript
      54           82 :                  << "title"     // Titles WTF?
      55           82 :                  << "head"      // Don't need head
      56           82 :                  << "style"     // Custom styles.
      57           82 :                  << "iframe"    // Iframes!
      58           82 :                  << "object"    // Plugins!
      59           82 :                  << "embed"     // Other plugins!
      60           41 :                  << "hr";       // No horizontals allowed; they're ugly.
      61              : 
      62           41 :     classesToRemove << "feedflare"                  // Feedburger's 37 pieces of flare
      63           82 :                     << "mf-viral"                   // Motherfucking viral?
      64           41 :                     << "service-links-stumbleupon"; // StubbleUponYourFace
      65              : 
      66           41 :     shareButtonURLs << "twitter.com/home?status"
      67           82 :                     << "plus.google.com/shar"
      68           82 :                     << "facebook.com/shar"
      69           82 :                     << "feedsportal.com/"
      70           82 :                     << "api.tweetmeme.com/"
      71           82 :                     << "stumbleupon.com/submit"
      72           41 :                     << "share.feedsportal.com/share";
      73              : 
      74           41 :      containerTags << "p"
      75           82 :                    << "div"
      76           82 :                    << "span"
      77           41 :                    << "pre";
      78              : 
      79           58 :     urlTransform = [](const QString& url) {
      80           17 :         return WebUtilities::urlFixup(url);
      81           41 :     };
      82           41 : }
      83              : 
      84           56 : void HTMLSanitizer::reset()
      85              : {
      86           56 :     idsToDelete.clear();
      87           56 :     currentId = 0;
      88           56 : }
      89              : 
      90          454 : bool HTMLSanitizer::isHTMLEmpty(const QString& html)
      91              : {
      92          454 :     QString copy = html;
      93          454 :     copy.replace(" ", "");
      94          454 :     copy.replace("\t", "");
      95          454 :     copy.replace("\n", "");
      96              : 
      97          908 :     return copy.size() == 0;
      98          454 : }
      99              : 
     100           25 : bool HTMLSanitizer::isShareURL(const QString &url)
     101              : {
     102          188 :     for (const QString& shareURL : shareButtonURLs) {
     103          165 :         if (url.contains(shareURL, Qt::CaseInsensitive)) {
     104            2 :             return true;
     105              :         }
     106              :     }
     107              : 
     108           23 :     return false;
     109              : }
     110              : 
     111          285 : QString HTMLSanitizer::intToID(int id)
     112              : {
     113          285 :     return "FangID_" + QString::number(id);
     114              : }
     115              : 
     116           53 : QString HTMLSanitizer::sanitize(const QString &document, QSet<QUrl> &imageURLs)
     117              : {
     118              :     // We use TidyLib to convert the (potentially crappy) HTML into proper
     119              :     // XHTML. This will add a doctype and other unwanted headers/footers, so we strip those
     120              :     // out in a separate post-processing method. You'll see.
     121           53 :     QString doc = QTidyLibClassic::toXhtml("<html><body>" + document + "</body></html>");
     122           53 :     if (doc.isEmpty()) {
     123            0 :         qCDebug(logRewriter) << "Error loading HTML document";
     124              : 
     125            0 :         return "";
     126              :     }
     127              : 
     128              :     // Swap out non-breaking spaces here since QXmlStreamReader doesn't handle them well.
     129           53 :     doc.replace("&nbsp;", " ", Qt::CaseInsensitive);
     130              : 
     131              :     // We're going to count the number of tags to determine if this is a real HTML document,
     132              :     // or a text document.
     133           53 :     int tagCount = 0;
     134              : 
     135           53 :     QXmlStreamReader xml;
     136           53 :     xml.addData(doc);
     137              : 
     138           53 :     QString output;
     139           53 :     QXmlStreamWriter writer(&output);
     140           53 :     writer.setAutoFormatting(false);
     141              : 
     142              :     // If we're skipping elements, this is >= 1
     143           53 :     int skip = 0;
     144              : 
     145              :     // Current stack.
     146           53 :     QStack<DOMNode> stack;
     147              : 
     148              :     // Was the last node text?
     149           53 :     bool lastWasText = false;
     150              : 
     151              :     // Track nesting depth inside <pre> to preserve whitespace.
     152           53 :     int preDepth = 0;
     153              : 
     154         1720 :     while (!xml.atEnd()) {
     155              :         // Grab the next thingie.
     156         1667 :         xml.readNext();
     157              : 
     158         1667 :         if (xml.isStartElement()) {
     159              :             // Start
     160          441 :             tagCount++;
     161              : 
     162          441 :             if (0 == skip) {
     163          332 :                 QString tagName = xml.name().toString().toLower();
     164          664 :                 QString classValue = xml.attributes().value("class").toString();
     165          664 :                 QString href = xml.attributes().value(S_HREF).toString();
     166              : 
     167          602 :                 if (tagsToRemove.contains(tagName) ||
     168          539 :                         classesToRemove.contains(classValue) || // Delete known bad classes
     169          904 :                         (tagName == "a" && isShareURL(href)) || // Delete share links
     170          609 :                         (tagName == "br" && !lastWasText)) {    // Delete br's that weren't preceeded by text.
     171              :                     // Skip it good!
     172           73 :                     skip = 1;
     173              :                 } else {
     174              :                     // Write the tag.
     175          259 :                     writer.writeStartElement(tagName);
     176              : 
     177          259 :                     currentId++;
     178          518 :                     writer.writeAttribute(S_ID, intToID(currentId));
     179              : 
     180              :                     // If there's a parent node, add a child.
     181          259 :                     if (stack.size()) {
     182          206 :                         stack.top().numChildren++;
     183              :                     }
     184              : 
     185              :                     // Push it.
     186          259 :                     stack.push(DOMNode(tagName, currentId));
     187              : 
     188          259 :                     if (tagName == "pre") {
     189            5 :                         preDepth++;
     190              :                     }
     191              : 
     192              :                     // Anchor tags.
     193          282 :                     if (tagName == "a" && xml.attributes().hasAttribute(S_HREF)) {
     194           69 :                         writer.writeAttribute(S_HREF, xml.attributes().value(S_HREF).toString());
     195              :                     }
     196              : 
     197              :                     // Image tags.
     198          276 :                     if (tagName == S_IMG && xml.attributes().hasAttribute(S_SRC)) {
     199           34 :                         QString imgSrc = xml.attributes().value(S_SRC).toString();
     200           17 :                         if (urlTransform) {
     201           17 :                             imgSrc = urlTransform(imgSrc);
     202              :                         }
     203           34 :                         writer.writeAttribute(S_SRC, imgSrc);
     204              : 
     205              :                         // WordPress emoji: class="wp-smiley" images are inline emoji
     206              :                         // that should render at text size (~16px), not at their
     207              :                         // natural pixel dimensions (typically 72x72).
     208           34 :                         QString imgClassValue = xml.attributes().value("class").toString();
     209           17 :                         bool isSmiley = imgClassValue.contains("wp-smiley");
     210           17 :                         if (isSmiley) {
     211            2 :                             writer.writeAttribute(S_WIDTH, "16");
     212            2 :                             writer.writeAttribute(S_HEIGHT, "16");
     213            2 :                             writer.writeAttribute("data-smiley", "1");
     214              :                         }
     215              : 
     216              :                         // Check for tracking pixels using HTML dimensions.
     217           17 :                         if (!isSmiley) {
     218           30 :                             QString sWidth = xml.attributes().value(S_WIDTH).toString();
     219           30 :                             QString sHeight = xml.attributes().value(S_HEIGHT).toString();
     220              : 
     221              :                             bool widthOK, heightOK;
     222           15 :                             int width = sWidth.toInt(&widthOK);
     223           15 :                             int height = sHeight.toInt(&heightOK);
     224              : 
     225           15 :                             if (widthOK && heightOK) {
     226           10 :                                 if (width < 3 || height < 3) {
     227              :                                     // Delete tiny images (tracking pixels).
     228            3 :                                     idsToDelete << intToID(currentId);
     229              :                                 } else {
     230              :                                     // Pass dimensions as metadata for finalize() to use
     231              :                                     // when the image fetch fails and we need to verify
     232              :                                     // this isn't a tracking pixel.
     233           14 :                                     writer.writeAttribute(S_WIDTH, sWidth);
     234           14 :                                     writer.writeAttribute(S_HEIGHT, sHeight);
     235              :                                 }
     236              :                             }
     237           15 :                         }
     238              : 
     239              :                         // Fetch images for caching and dimension verification.
     240           17 :                         if (!idsToDelete.contains(intToID(currentId))) {
     241           14 :                             imageURLs << imgSrc;
     242              :                         }
     243           17 :                     }
     244              : 
     245          259 :                     lastWasText = false;
     246              :                 }
     247          332 :             } else {
     248          109 :                  skip++;
     249              :             }
     250         1226 :         } else if (xml.isEndElement()) {
     251          441 :             QString tagName = xml.name().toString().toLower();
     252              : 
     253              :             // End
     254          441 :             if (0 == skip) {
     255          259 :                 writer.writeEndElement();
     256              : 
     257              :                 // Pop our node and investigate.
     258          259 :                 DOMNode dom = stack.pop();
     259              : 
     260          259 :                 if (tagName == "pre") {
     261            5 :                     preDepth--;
     262              :                 }
     263              : 
     264              :                 // If it's a container and we didn't write any text, then delete this tag in the
     265              :                 // second pass.
     266          259 :                 if (containerTags.contains(tagName) && dom.nonEmptyTextCount == 0 && dom.numChildren == 0) {
     267              :                     //
     268              :                     // This doesn't work -- at the very least the IDs are wrong. We need to
     269              :                     // employ a stack here.
     270              :                     //
     271            6 :                     idsToDelete << intToID(dom.intID);
     272              :                 }
     273              : 
     274          259 :                 lastWasText = false;
     275          259 :             } else {
     276          182 :                 skip--;
     277              :             }
     278         1226 :         } else if (xml.isCharacters() && 0 == skip) {
     279              :             // Text
     280          454 :             QString text = xml.text().toString();
     281          454 :             bool isEmpty = isHTMLEmpty(text);
     282              : 
     283              :             // Don't allow pure empty tags, though a single space is ok.
     284          454 :             if (!isEmpty || text == " ") {
     285          124 :                 if (preDepth == 0) {
     286          118 :                     bool addSpaceStart = text.startsWith('\n');
     287          118 :                     bool addSpaceEnd = text.endsWith('\n');
     288              : 
     289              :                     // Text can start or end with a newline -- delete 'em.
     290          118 :                     removeNewlinesBothSides(text);
     291              : 
     292              :                     // Add back extra spaces so text doesn'truntogether.
     293          118 :                     if (addSpaceStart) {
     294           12 :                         text = ' ' + text;
     295              :                     }
     296              : 
     297          118 :                     if (addSpaceEnd) {
     298            7 :                         text = text + ' ';
     299              :                     }
     300              :                 }
     301              : 
     302              :                 // Write the text!
     303          124 :                 writer.writeCharacters(text);
     304              : 
     305          124 :                 if (!isEmpty) {
     306          117 :                     stack.top().nonEmptyTextCount++;
     307              :                 }
     308              : 
     309          124 :                 lastWasText = true;
     310              :             }
     311          785 :         } else if (xml.isEntityReference() && 0 == skip) {
     312              :             // Entity
     313            0 :             QString entity = xml.name().toString();
     314            0 :             writer.writeEntityReference(entity);
     315          331 :         } else if (xml.isStartDocument()) {
     316              :             // Doc start
     317           53 :             writer.writeStartDocument("1.0");
     318          278 :         } else if (xml.isEndDocument()) {
     319              :             // Doc end
     320           53 :             writer.writeEndElement();
     321              :         }
     322              :     }
     323              : 
     324           53 :     if (xml.hasError()) {
     325            0 :         qCDebug(logRewriter) << "Error reading XML: " << xml.errorString();
     326              :     }
     327              : 
     328           53 :     if (writer.hasError()) {
     329            0 :         qCDebug(logRewriter) << "QXmlStreamWriter had an error of some kind.";
     330              :     }
     331              : 
     332              : 
     333           58 :     if (tagCount <= 5 && output !=
     334           58 :             "<?xml version=\"1.0\"?><html id=\"FangID_1\"><body id=\"FangID_2\"/></html>") {
     335              :         // Turns out we're not dealing with an HTML document: there's not enough tags, and it's
     336              :         // not an empty document (which can be caused by bad HTML.)
     337              :         // Ditch the Tidy'd doc and rewrite as plain text from the original.
     338            4 :         return textToHtml(document);
     339              :     }
     340              : 
     341              :     // Return new document.
     342           49 :     return output;
     343           53 : }
     344              : 
     345          118 : void HTMLSanitizer::removeNewlinesBothSides(QString &docString)
     346              : {
     347          130 :     while (docString.startsWith("\n")) {
     348           12 :         docString = docString.mid(1);
     349              :     }
     350              : 
     351          125 :     while (docString.endsWith("\n")) {
     352            7 :         docString = docString.left(docString.length() - 1);
     353              :     }
     354          118 : }
     355              : 
     356            6 : QString HTMLSanitizer::textToHtml(const QString& input)
     357              : {
     358            6 :     QString output;
     359              : 
     360              :     // Keep it simple, stupid.
     361            6 :     QString cleaned = input.trimmed();
     362            6 :     cleaned.replace("\r\n", "\r");
     363            6 :     cleaned.replace("\r", "\n");
     364              : 
     365            6 :     QStringList list = cleaned.split('\n', Qt::SkipEmptyParts);
     366           17 :     for (const QString& line : list) {
     367              :         // Trim lines, and skip empty ones.
     368           11 :         QString trimmed = line.trimmed();
     369           11 :         if (!trimmed.isEmpty()) {
     370           10 :             output += "<p>" + trimmed + "</p>";
     371              :         }
     372           11 :     }
     373              : 
     374              :     // As a signal to the 2nd pass, we prepend the output with an ASCII beep character. 2nd pass
     375              :     // will remove this and return the string without further modification.
     376            6 :     output = '\07' + output;
     377              : 
     378           12 :     return output;
     379            6 : }
     380              : 
     381           42 : QString HTMLSanitizer::finalize(const QString &html, const QMap<QUrl, ImageData> &imageResults)
     382              : {
     383              :     // If it was a text-only document, we've prepended it with an ASCII beep. All we have to do
     384              :     // here is remove the beep and return it.
     385           42 :     if (html.startsWith('\07')) {
     386            3 :         return html.mid(1);
     387              :     }
     388              : 
     389           39 :     QXmlStreamReader xml;
     390           39 :     xml.addData(html);
     391              : 
     392           39 :     QString output;
     393           39 :     QXmlStreamWriter writer(&output);
     394           39 :     writer.setAutoFormatting(false);
     395           39 :     int skip = 0; // Skip stack.
     396           39 :     int preDepth = 0; // Track nesting depth inside <pre> to preserve whitespace.
     397           39 :     QString lastTag = "";
     398              : 
     399          658 :     while (!xml.atEnd()) {
     400              :         // Grab the next thingie.
     401          619 :         xml.readNext();
     402              : 
     403          619 :         if (xml.isStartElement()) {
     404          218 :             if (0 == skip) {
     405              :                 // Start
     406          218 :                 QString tagName = xml.name().toString().toLower();
     407          436 :                 QString id = xml.attributes().value(S_ID).toString();
     408              : 
     409          218 :                 if (idsToDelete.contains(id)) {
     410              :                     // We need to delete this tag! Skip it.
     411            8 :                     skip = 1;
     412          210 :                 } else if (tagName == S_IMG) {
     413           36 :                     QString url = xml.attributes().value(S_SRC).toString();
     414           18 :                     QString srcToUse = url;
     415           18 :                     bool keepImage = false;
     416           36 :                     bool isSmiley = xml.attributes().value("data-smiley").toString() == "1";
     417              : 
     418           18 :                     int width = 0;
     419           18 :                     int height = 0;
     420              : 
     421           18 :                     ImageData imageData = imageResults.value(url);
     422           18 :                     if (imageData.isValid()) {
     423           14 :                         if (isSmiley) {
     424              :                             // WordPress emoji: use the small dimensions from
     425              :                             // sanitize() instead of the fetched pixel size.
     426            4 :                             width = xml.attributes().value(S_WIDTH).toInt();
     427            4 :                             height = xml.attributes().value(S_HEIGHT).toInt();
     428              :                         } else {
     429           12 :                             width = imageData.image.width();
     430           12 :                             height = imageData.image.height();
     431              :                         }
     432              : 
     433           14 :                         if (width > 2 && height > 2) {
     434           11 :                             QString cachedPath = "/images/" + QImageCache::saveImage(url, imageData);
     435           11 :                             if (!cachedPath.isEmpty()) {
     436           11 :                                 srcToUse = cachedPath;
     437              :                             }
     438           11 :                             keepImage = true;
     439           11 :                         }
     440           16 :                     } else if (xml.attributes().hasAttribute(S_WIDTH) &&
     441            8 :                                xml.attributes().hasAttribute(S_HEIGHT)) {
     442              :                         // Fetch failed but image has known good dimensions from
     443              :                         // sanitize() - keep it with the original URL.
     444            4 :                         width = xml.attributes().value(S_WIDTH).toInt();
     445            4 :                         height = xml.attributes().value(S_HEIGHT).toInt();
     446            2 :                         keepImage = true;
     447              :                     }
     448              :                     // else: fetch failed and no known dimensions - skip.
     449              :                     // Could be a tracking pixel we can't verify.
     450              : 
     451           18 :                     if (keepImage) {
     452           13 :                         writer.writeStartElement(tagName);
     453           26 :                         writer.writeAttribute(S_SRC, srcToUse);
     454           13 :                         if (width > 0 && height > 0) {
     455           26 :                             writer.writeAttribute(S_WIDTH, QString::number(width));
     456           26 :                             writer.writeAttribute(S_HEIGHT, QString::number(height));
     457              :                         }
     458           13 :                         if (isSmiley) {
     459            2 :                             writer.writeAttribute("class", "smiley");
     460              :                         }
     461           13 :                         if (srcToUse != url) {
     462           22 :                             writer.writeAttribute("data-original-src", url);
     463              :                         }
     464           13 :                         lastTag = tagName;
     465              :                     } else {
     466            5 :                         skip = 1;
     467              :                     }
     468           18 :                 } else {
     469              :                     // Write the tag and all attributes (except for ID)
     470          192 :                     writer.writeStartElement(tagName);
     471          407 :                     for (const QXmlStreamAttribute& attribute : xml.attributes()) {
     472          215 :                         if (attribute.name().toString() != S_ID) {
     473           23 :                             writer.writeAttribute(attribute);
     474              :                         }
     475          192 :                     }
     476              : 
     477          192 :                     if (tagName == "pre") {
     478            4 :                         preDepth++;
     479              :                     }
     480              : 
     481          192 :                     lastTag = tagName;
     482              :                 }
     483          218 :             } else {
     484            0 :                 skip++;
     485              :             }
     486          401 :         } else if (xml.isEndElement()) {
     487              :             // End
     488          218 :             if (0 == skip) {
     489          205 :                 if (xml.name().toString().toLower() == "pre") {
     490            4 :                     preDepth--;
     491              :                 }
     492          205 :                 writer.writeEndElement();
     493              :             } else {
     494           13 :                 skip--;
     495              :             }
     496          183 :         } else if (xml.isCharacters() && 0 == skip) {
     497              :             // Text
     498          101 :             QString text = xml.text().toString();
     499          101 :             if (preDepth == 0) {
     500              :                 // Outside preformatted blocks, collapse newlines to spaces.
     501           96 :                 text.replace("\n", " ");
     502              :             }
     503              : 
     504          101 :             writer.writeCharacters(text);
     505          101 :             lastTag = "#text";
     506          183 :         } else if (xml.isEntityReference() && 0 == skip) {
     507              :             // Entity
     508            0 :             QString entity = xml.name().toString();
     509            0 :             writer.writeEntityReference(entity);
     510            0 :             lastTag = "#entity";
     511           82 :         } else if (xml.isStartDocument()) {
     512              :             // Doc start
     513           39 :             writer.writeStartDocument(xml.documentVersion().toString());
     514           43 :         } else if (xml.isEndDocument()) {
     515              :             // Doc end;xml.documentVersion()
     516           39 :             writer.writeEndElement();
     517              :         }
     518              :     }
     519              : 
     520           39 :     if (xml.hasError()) {
     521            0 :         qCDebug(logRewriter) << "QXmlStreamReader had error: " << xml.errorString();
     522              :     }
     523              : 
     524           39 :     if (writer.hasError()) {
     525            0 :         qCDebug(logRewriter) << "QXmlStreamWriter had an error of some kind.";
     526              :     }
     527              : 
     528              :     // Post-process and return.
     529           39 :     postProcessDocString(output);
     530           39 :     return output;
     531           39 : }
     532              : 
     533           39 : void HTMLSanitizer::postProcessDocString(QString &docString)
     534              : {
     535              :     // The R is for Redundant!
     536           39 :     docString.replace("\r", "");
     537              : 
     538              :     // Rip out headers/footers.
     539           39 :     docString.replace("<?xml version=\"1.0\"?><html>", "");
     540           39 :     docString.replace("<body>", "");
     541           39 :     docString.replace("</body></html>", "");
     542           39 :     docString.replace("<body/></html>", ""); // Empty body!
     543              : 
     544              :     // This happens.
     545           39 :     docString = docString.trimmed();
     546           39 : }
        

Generated by: LCOV version 2.0-1