LCOV - code coverage report
Current view: top level - src/utilities - HTMLSanitizer.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 95.4 % 261 249
Test Date: 2026-03-23 10:19:47 Functions: 100.0 % 10 10

            Line data    Source code
       1              : #include "HTMLSanitizer.h"
       2              : #include "FangLogging.h"
       3              : 
       4              : #include <QXmlStreamReader>
       5              : #include <QXmlStreamWriter>
       6              : #include <QStack>
       7              : 
       8              : #include "ImageCache.h"
       9              : #include "NetworkUtilities.h"
      10              : 
      11              : // Strings.
      12              : #define S_WIDTH "width"
      13              : #define S_HEIGHT "height"
      14              : #define S_SRC "src"
      15              : #define S_IMG "img"
      16              : #define S_HREF "href"
      17              : #define S_ID "id"
      18              : 
      19           38 : HTMLSanitizer::HTMLSanitizer(QObject *parent) :
      20              :     FangObject(parent),
      21           38 :     webPageGrabber(false),
      22           38 :     currentId(0)
      23              : {
      24           38 :     tagsToRemove << "script"    // Javascript
      25           76 :                  << "title"     // Titles WTF?
      26           76 :                  << "head"      // Don't need head
      27           76 :                  << "style"     // Custom styles.
      28           76 :                  << "iframe"    // Iframes!
      29           76 :                  << "object"    // Plugins!
      30           76 :                  << "embed"     // Other plugins!
      31           38 :                  << "hr";       // No horizontals allowed; they're ugly.
      32              : 
      33           38 :     classesToRemove << "feedflare"                  // Feedburger's 37 pieces of flare
      34           76 :                     << "mf-viral"                   // Motherfucking viral?
      35           38 :                     << "service-links-stumbleupon"; // StubbleUponYourFace
      36              : 
      37           38 :     shareButtonURLs << "twitter.com/home?status"
      38           76 :                     << "plus.google.com/shar"
      39           76 :                     << "facebook.com/shar"
      40           76 :                     << "feedsportal.com/"
      41           76 :                     << "api.tweetmeme.com/"
      42           76 :                     << "stumbleupon.com/submit"
      43           38 :                     << "share.feedsportal.com/share";
      44              : 
      45           38 :      containerTags << "p"
      46           76 :                    << "div"
      47           76 :                    << "span"
      48           38 :                    << "pre";
      49           38 : }
      50              : 
      51           31 : void HTMLSanitizer::reset()
      52              : {
      53           31 :     idsToDelete.clear();
      54           31 :     currentId = 0;
      55           31 : }
      56              : 
      57          288 : bool HTMLSanitizer::isHTMLEmpty(const QString& html)
      58              : {
      59          288 :     QString copy = html;
      60          288 :     copy.replace(" ", "");
      61          288 :     copy.replace("\t", "");
      62          288 :     copy.replace("\n", "");
      63              : 
      64          576 :     return copy.size() == 0;
      65          288 : }
      66              : 
      67           24 : bool HTMLSanitizer::isShareURL(const QString &url)
      68              : {
      69          187 :     for (const QString& shareURL : shareButtonURLs) {
      70          164 :         if (url.contains(shareURL, Qt::CaseInsensitive)) {
      71            1 :             return true;
      72              :         }
      73              :     }
      74              : 
      75           23 :     return false;
      76              : }
      77              : 
      78          190 : QString HTMLSanitizer::intToID(int id)
      79              : {
      80          190 :     return "FangID_" + QString::number(id);
      81              : }
      82              : 
      83           30 : QString HTMLSanitizer::sanitize(const QString &document, QSet<QUrl> &imageURLs)
      84              : {
      85              :     // We use TidyLib via WebPageGrabber to convert the (potentially crappy) HTML into proper
      86              :     // XHTML.  This will add a doctype and other unwanted headers/footers, so we strip those
      87              :     // out in a separate post-processing method.  You'll see.
      88           30 :     QString* doc = webPageGrabber.load("<html><body>" + document + "</body></html>");
      89           30 :     if (doc == nullptr) {
      90            0 :         qCDebug(logRewriter) << "Error loading HTML document";
      91              : 
      92            0 :         return "";
      93              :     }
      94              : 
      95              :     // Swap out non-breaking spaces here since QXmlStreamReader doesn't handle them well.
      96           30 :     doc->replace("&nbsp;", " ", Qt::CaseInsensitive);
      97              : 
      98              :     // We're going to count the number of tags to determine if this is a real HTML document,
      99              :     // or a text document.
     100           30 :     int tagCount = 0;
     101              : 
     102           30 :     QXmlStreamReader xml;
     103           30 :     xml.addData(*doc);
     104              : 
     105           30 :     QString output;
     106           30 :     QXmlStreamWriter writer(&output);
     107           30 :     writer.setAutoFormatting(false);
     108              : 
     109              :     // If we're skipping elements, this is >= 1
     110           30 :     int skip = 0;
     111              : 
     112              :     // Current stack.
     113           30 :     QStack<DOMNode> stack;
     114              : 
     115              :     // Was the last node text?
     116           30 :     bool lastWasText = false;
     117              : 
     118              :     // Track nesting depth inside <pre> to preserve whitespace.
     119           30 :     int preDepth = 0;
     120              : 
     121         1052 :     while (!xml.atEnd()) {
     122              :         // Grab the next thingie.
     123         1022 :         xml.readNext();
     124              : 
     125         1022 :         if (xml.isStartElement()) {
     126              :             // Start
     127          275 :             tagCount++;
     128              : 
     129          275 :             if (0 == skip) {
     130          214 :                 QString tagName = xml.name().toString().toLower();
     131          428 :                 QString classValue = xml.attributes().value("class").toString();
     132          428 :                 QString href = xml.attributes().value(S_HREF).toString();
     133              : 
     134          394 :                 if (tagsToRemove.contains(tagName) ||
     135          360 :                         classesToRemove.contains(classValue) || // Delete known bad classes
     136          603 :                         (tagName == "a" && isShareURL(href)) || // Delete share links
     137          399 :                         (tagName == "br" && !lastWasText)) {    // Delete br's that weren't preceeded by text.
     138              :                     // Skip it good!
     139           40 :                     skip = 1;
     140              :                 } else {
     141              :                     // Write the tag.
     142          174 :                     writer.writeStartElement(tagName);
     143              : 
     144          174 :                     currentId++;
     145          348 :                     writer.writeAttribute(S_ID, intToID(currentId));
     146              : 
     147              :                     // If there's a parent node, add a child.
     148          174 :                     if (stack.size()) {
     149          144 :                         stack.top().numChildren++;
     150              :                     }
     151              : 
     152              :                     // Push it.
     153          174 :                     stack.push(DOMNode(tagName, currentId));
     154              : 
     155          174 :                     if (tagName == "pre") {
     156            4 :                         preDepth++;
     157              :                     }
     158              : 
     159              :                     // Anchor tags.
     160          197 :                     if (tagName == "a" && xml.attributes().hasAttribute(S_HREF)) {
     161           69 :                         writer.writeAttribute(S_HREF, xml.attributes().value(S_HREF).toString());
     162              :                     }
     163              : 
     164              :                     // Image tags.
     165          184 :                     if (tagName == S_IMG && xml.attributes().hasAttribute(S_SRC)) {
     166           20 :                         QString imgSrc = NetworkUtilities::urlFixup(xml.attributes().value(S_SRC).toString());
     167           20 :                         writer.writeAttribute(S_SRC, imgSrc);
     168              : 
     169              :                         // Check for tracking pixels using HTML dimensions.
     170           20 :                         QString sWidth = xml.attributes().value(S_WIDTH).toString();
     171           20 :                         QString sHeight = xml.attributes().value(S_HEIGHT).toString();
     172              : 
     173              :                         bool widthOK, heightOK;
     174           10 :                         int width = sWidth.toInt(&widthOK);
     175           10 :                         int height = sHeight.toInt(&heightOK);
     176              : 
     177           10 :                         if (widthOK && heightOK) {
     178            7 :                             if (width < 3 || height < 3) {
     179              :                                 // Delete tiny images (tracking pixels).
     180            1 :                                 idsToDelete << intToID(currentId);
     181              :                             } else {
     182              :                                 // Pass dimensions as metadata for finalize() to use
     183              :                                 // when the image fetch fails and we need to verify
     184              :                                 // this isn't a tracking pixel.
     185           12 :                                 writer.writeAttribute(S_WIDTH, sWidth);
     186           12 :                                 writer.writeAttribute(S_HEIGHT, sHeight);
     187              :                             }
     188              :                         }
     189              : 
     190              :                         // Fetch images for caching and dimension verification.
     191           10 :                         if (!idsToDelete.contains(intToID(currentId))) {
     192            9 :                             imageURLs << imgSrc;
     193              :                         }
     194           10 :                     }
     195              :                 }
     196              : 
     197          214 :                 lastWasText = false;
     198          214 :             } else {
     199           61 :                  skip++;
     200              :             }
     201          747 :         } else if (xml.isEndElement()) {
     202          275 :             QString tagName = xml.name().toString().toLower();
     203              : 
     204              :             // End
     205          275 :             if (0 == skip) {
     206          174 :                 writer.writeEndElement();
     207              : 
     208              :                 // Pop our node and investigate.
     209          174 :                 DOMNode dom = stack.pop();
     210              : 
     211          174 :                 if (tagName == "pre") {
     212            4 :                     preDepth--;
     213              :                 }
     214              : 
     215              :                 // If it's a container and we didn't write any text, then delete this tag in the
     216              :                 // second pass.
     217          174 :                 if (containerTags.contains(tagName) && dom.nonEmptyTextCount == 0 && dom.numChildren == 0) {
     218              :                     //
     219              :                     // This doesn't work -- at the very least the IDs are wrong.  We need to
     220              :                     // employ a stack here.
     221              :                     //
     222            5 :                     idsToDelete << intToID(dom.intID);
     223              :                 }
     224              : 
     225          174 :                 lastWasText = false;
     226          174 :             } else {
     227          101 :                 skip--;
     228              :             }
     229          747 :         } else if (xml.isCharacters() && 0 == skip) {
     230              :             // Text
     231          288 :             QString text = xml.text().toString();
     232          288 :             bool isEmpty = isHTMLEmpty(text);
     233              : 
     234              :             // Don't allow pure empty tags, though a single space is ok.
     235          288 :             if (!isEmpty || text == " ") {
     236           93 :                 if (preDepth == 0) {
     237           88 :                     bool addSpaceStart = text.startsWith('\n');
     238           88 :                     bool addSpaceEnd = text.endsWith('\n');
     239              : 
     240              :                     // Text can start or end with a newline -- delete 'em.
     241           88 :                     removeNewlinesBothSides(text);
     242              : 
     243              :                     // Add back extra spaces so text doesn'truntogether.
     244           88 :                     if (addSpaceStart) {
     245            9 :                         text = ' ' + text;
     246              :                     }
     247              : 
     248           88 :                     if (addSpaceEnd) {
     249            6 :                         text = text + ' ';
     250              :                     }
     251              :                 }
     252              : 
     253              :                 // Write the text!
     254           93 :                 writer.writeCharacters(text);
     255              : 
     256           93 :                 if (!isEmpty) {
     257           86 :                     stack.top().nonEmptyTextCount++;
     258              :                 }
     259              : 
     260           93 :                 lastWasText = true;
     261              :             }
     262          472 :         } else if (xml.isEntityReference() && 0 == skip) {
     263              :             // Entity
     264            0 :             QString entity = xml.name().toString();
     265            0 :             writer.writeEntityReference(entity);
     266          184 :         } else if (xml.isStartDocument()) {
     267              :             // Doc start
     268           30 :             writer.writeStartDocument("1.0");
     269          154 :         } else if (xml.isEndDocument()) {
     270              :             // Doc end
     271           30 :             writer.writeEndElement();
     272              :         }
     273              :     }
     274              : 
     275           30 :     if (xml.hasError()) {
     276            0 :         qCDebug(logRewriter) << "Error reading XML: " << xml.errorString();
     277              :     }
     278              : 
     279           30 :     if (writer.hasError()) {
     280            0 :         qCDebug(logRewriter) << "QXmlStreamWriter had an error of some kind.";
     281              :     }
     282              : 
     283              : 
     284           34 :     if (tagCount <= 5 && output !=
     285           34 :             "<?xml version=\"1.0\"?><html id=\"FangID_1\"><body id=\"FangID_2\"/></html>") {
     286              :         // Turns out we're not dealing with an HTML document: there's not enough tags, and it's
     287              :         // not an empty document (which can be caused by bad HTML.)
     288              :         // Ditch the Tidy'd doc and rewrite as plain text from the original.
     289            3 :         return textToHtml(document);
     290              :     }
     291              : 
     292              :     // Return new document.
     293           27 :     return output;
     294           30 : }
     295              : 
     296           37 : QString HTMLSanitizer::finalize(const QString &html, const QMap<QUrl, ImageData> &imageResults)
     297              : {
     298              :     // If it was a text-only document, we've prepended it with an ASCII beep.  All we have to do
     299              :     // here is remove the beep and return it.
     300           37 :     if (html.startsWith('\07')) {
     301            3 :         return html.mid(1);
     302              :     }
     303              : 
     304           34 :     QXmlStreamReader xml;
     305           34 :     xml.addData(html);
     306              : 
     307           34 :     QString output;
     308           34 :     QXmlStreamWriter writer(&output);
     309           34 :     writer.setAutoFormatting(false);
     310           34 :     int skip = 0; // Skip stack.
     311           34 :     int preDepth = 0; // Track nesting depth inside <pre> to preserve whitespace.
     312           34 :     QString lastTag = "";
     313              : 
     314          582 :     while (!xml.atEnd()) {
     315              :         // Grab the next thingie.
     316          548 :         xml.readNext();
     317              : 
     318          548 :         if (xml.isStartElement()) {
     319          193 :             if (0 == skip) {
     320              :                 // Start
     321          193 :                 QString tagName = xml.name().toString().toLower();
     322          386 :                 QString id = xml.attributes().value(S_ID).toString();
     323              : 
     324          193 :                 if (idsToDelete.contains(id)) {
     325              :                     // We need to delete this tag! Skip it.
     326            6 :                     skip = 1;
     327          187 :                 } else if (tagName == S_IMG) {
     328           32 :                     QString url = xml.attributes().value(S_SRC).toString();
     329           16 :                     QString srcToUse = url;
     330           16 :                     bool keepImage = false;
     331              : 
     332           16 :                     int width = 0;
     333           16 :                     int height = 0;
     334              : 
     335           16 :                     ImageData imageData = imageResults.value(url);
     336           16 :                     if (imageData.isValid()) {
     337           12 :                         width = imageData.image.width();
     338           12 :                         height = imageData.image.height();
     339              : 
     340           12 :                         if (width > 2 && height > 2) {
     341            9 :                             QString cachedPath = ImageCache::saveImage(url, imageData);
     342            9 :                             if (!cachedPath.isEmpty()) {
     343            9 :                                 srcToUse = cachedPath;
     344              :                             }
     345            9 :                             keepImage = true;
     346            9 :                         }
     347           16 :                     } else if (xml.attributes().hasAttribute(S_WIDTH) &&
     348            8 :                                xml.attributes().hasAttribute(S_HEIGHT)) {
     349              :                         // Fetch failed but image has known good dimensions from
     350              :                         // sanitize() - keep it with the original URL.
     351            4 :                         width = xml.attributes().value(S_WIDTH).toInt();
     352            4 :                         height = xml.attributes().value(S_HEIGHT).toInt();
     353            2 :                         keepImage = true;
     354              :                     }
     355              :                     // else: fetch failed and no known dimensions - skip.
     356              :                     // Could be a tracking pixel we can't verify.
     357              : 
     358           16 :                     if (keepImage) {
     359           11 :                         writer.writeStartElement(tagName);
     360           22 :                         writer.writeAttribute(S_SRC, srcToUse);
     361           11 :                         if (width > 0 && height > 0) {
     362           22 :                             writer.writeAttribute(S_WIDTH, QString::number(width));
     363           22 :                             writer.writeAttribute(S_HEIGHT, QString::number(height));
     364              :                         }
     365           11 :                         if (srcToUse != url) {
     366           18 :                             writer.writeAttribute("data-original-src", url);
     367              :                         }
     368           11 :                         lastTag = tagName;
     369              :                     } else {
     370            5 :                         skip = 1;
     371              :                     }
     372           16 :                 } else {
     373              :                     // Write the tag and all attributes (except for ID)
     374          171 :                     writer.writeStartElement(tagName);
     375          365 :                     for (const QXmlStreamAttribute& attribute : xml.attributes()) {
     376          194 :                         if (attribute.name().toString() != S_ID) {
     377           23 :                             writer.writeAttribute(attribute);
     378              :                         }
     379          171 :                     }
     380              : 
     381          171 :                     if (tagName == "pre") {
     382            4 :                         preDepth++;
     383              :                     }
     384              : 
     385          171 :                     lastTag = tagName;
     386              :                 }
     387          193 :             } else {
     388            0 :                 skip++;
     389              :             }
     390          355 :         } else if (xml.isEndElement()) {
     391              :             // End
     392          193 :             if (0 == skip) {
     393          182 :                 if (xml.name().toString().toLower() == "pre") {
     394            4 :                     preDepth--;
     395              :                 }
     396          182 :                 writer.writeEndElement();
     397              :             } else {
     398           11 :                 skip--;
     399              :             }
     400          162 :         } else if (xml.isCharacters() && 0 == skip) {
     401              :             // Text
     402           90 :             QString text = xml.text().toString();
     403           90 :             if (preDepth == 0) {
     404              :                 // Outside preformatted blocks, collapse newlines to spaces.
     405           85 :                 text.replace("\n", " ");
     406              :             }
     407              : 
     408           90 :             writer.writeCharacters(text);
     409           90 :             lastTag = "#text";
     410          162 :         } else if (xml.isEntityReference() && 0 == skip) {
     411              :             // Entity
     412            0 :             QString entity = xml.name().toString();
     413            0 :             writer.writeEntityReference(entity);
     414            0 :             lastTag = "#entity";
     415           72 :         } else if (xml.isStartDocument()) {
     416              :             // Doc start
     417           34 :             writer.writeStartDocument(xml.documentVersion().toString());
     418           38 :         } else if (xml.isEndDocument()) {
     419              :             // Doc end;xml.documentVersion()
     420           34 :             writer.writeEndElement();
     421              :         }
     422              :     }
     423              : 
     424           34 :     if (xml.hasError()) {
     425            0 :         qCDebug(logRewriter) << "QXmlStreamReader had error: " << xml.errorString();
     426              :     }
     427              : 
     428           34 :     if (writer.hasError()) {
     429            0 :         qCDebug(logRewriter) << "QXmlStreamWriter had an error of some kind.";
     430              :     }
     431              : 
     432              :     // Post-process and return.
     433           34 :     postProcessDocString(output);
     434           34 :     return output;
     435           34 : }
     436              : 
     437           34 : void HTMLSanitizer::postProcessDocString(QString &docString)
     438              : {
     439              :     // The R is for Redundant!
     440           34 :     docString.replace("\r", "");
     441              : 
     442              :     // Rip out headers/footers.
     443           34 :     docString.replace("<?xml version=\"1.0\"?><html>", "");
     444           34 :     docString.replace("<body>", "");
     445           34 :     docString.replace("</body></html>", "");
     446           34 :     docString.replace("<body/></html>", ""); // Empty body!
     447              : 
     448              :     // This happens.
     449           34 :     docString = docString.trimmed();
     450           34 : }
     451              : 
     452           88 : void HTMLSanitizer::removeNewlinesBothSides(QString &docString)
     453              : {
     454           97 :     while (docString.startsWith("\n")) {
     455            9 :         docString = docString.mid(1);
     456              :     }
     457              : 
     458           94 :     while (docString.endsWith("\n")) {
     459            6 :         docString = docString.left(docString.length() - 1);
     460              :     }
     461           88 : }
     462              : 
     463            3 : QString HTMLSanitizer::textToHtml(const QString& input)
     464              : {
     465            3 :     QString output;
     466              : 
     467              :     // Keep it simple, stupid.
     468            3 :     QString cleaned = input.trimmed();
     469            3 :     cleaned.replace("\r\n", "\r");
     470            3 :     cleaned.replace("\r", "\n");
     471              : 
     472            3 :     QStringList list = cleaned.split('\n', Qt::SkipEmptyParts);
     473            9 :     for (const QString& line : list) {
     474              :         // Trim lines, and skip empty ones.
     475            6 :         QString trimmed = line.trimmed();
     476            6 :         if (!trimmed.isEmpty()) {
     477            5 :             output += "<p>" + trimmed + "</p>";
     478              :         }
     479            6 :     }
     480              : 
     481              :     // As a signal to the 2nd pass, we prepend the output with an ASCII beep character.  2nd pass
     482              :     // will remove this and return the string without further modification.
     483            3 :     output = '\07' + output;
     484              : 
     485            6 :     return output;
     486            3 : }
     487              : 
        

Generated by: LCOV version 2.0-1