LCOV - code coverage report
Current view: top level - src/utilities - RawFeedRewriter.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 94.6 % 280 265
Test Date: 2026-01-27 22:31:25 Functions: 100.0 % 14 14

            Line data    Source code
       1              : #include "RawFeedRewriter.h"
       2              : 
       3              : #include <QXmlStreamReader>
       4              : #include <QXmlStreamWriter>
       5              : #include <QString>
       6              : #include <QStack>
       7              : #include <QDebug>
       8              : 
       9              : #include "NetworkUtilities.h"
      10              : 
      11              : // Image width max.
      12              : #define MAX_ELEMENT_WIDTH 400
      13              : 
      14              : // Strings.
      15              : #define S_WIDTH "width"
      16              : #define S_HEIGHT "height"
      17              : #define S_SRC "src"
      18              : #define S_IMG "img"
      19              : #define S_HREF "href"
      20              : #define S_ID "id"
      21              : 
      22           27 : RawFeedRewriter::RawFeedRewriter(QObject *parent) :
      23              :     FangObject(parent),
      24           27 :     newsList(nullptr),
      25           27 :     webPageGrabber(false),
      26           27 :     imageGrabber(),
      27           54 :     intID(0)
      28              : {
      29           27 :     connect(&imageGrabber, &ImageGrabber::finished, this, &RawFeedRewriter::onImageGrabberFinished);
      30              : 
      31           27 :     tagsToRemove << "script"    // Javascript
      32           54 :                  << "title"     // Titles WTF?
      33           54 :                  << "head"      // Don't need head
      34           54 :                  << "style"     // Custom styles.
      35           54 :                  << "iframe"    // Iframes!
      36           54 :                  << "object"    // Plugins!
      37           54 :                  << "embed"     // Other plugins!
      38           27 :                  << "hr";       // No horizontals allowed; they're ugly.
      39              : 
      40           27 :     classesToRemove << "feedflare"                  // Feedburger's 37 pieces of flare
      41           54 :                     << "mf-viral"                   // Motherfucking viral?
      42           27 :                     << "service-links-stumbleupon"; // StubbleUponYourFace
      43              : 
      44           27 :     shareButtonURLs << "twitter.com/home?status"
      45           54 :                     << "plus.google.com/shar"
      46           54 :                     << "facebook.com/shar"
      47           54 :                     << "feedsportal.com/"
      48           54 :                     << "api.tweetmeme.com/"
      49           54 :                     << "stumbleupon.com/submit"
      50           27 :                     << "share.feedsportal.com/share";
      51              : 
      52           27 :      containerTags << "p"
      53           54 :                    << "div"
      54           54 :                    << "span"
      55           27 :                    << "pre";
      56           27 : }
      57              : 
      58           27 : void RawFeedRewriter::rewrite(QList<RawNews *> *newsList)
      59              : {
      60              :     // Save our news list!</protestChant>
      61           27 :     this->newsList = newsList;
      62              : 
      63           27 :     QSet<QUrl> imageURLs;
      64           27 :     idsToDelete.clear();
      65           27 :     intID = 0;
      66              : 
      67              :     // Iterate over all the news we have.
      68           54 :     for (RawNews* news : *newsList) {
      69           27 :         if (news->content.size()) {
      70            0 :             news->content = rewriteFirstPass(news->content, imageURLs);
      71              :         }
      72              : 
      73           27 :         if (news->description.size()) {
      74           26 :             news->description = rewriteFirstPass(news->description, imageURLs);
      75              :         }
      76              :     }
      77              : 
      78              :     // No images? We're done, yay!
      79           27 :     if (imageURLs.size() == 0) {
      80              :         // Gotta do this, g.
      81           24 :         rewriteAllSecondPass();
      82           24 :         postProcess();
      83              : 
      84           24 :         emit finished();
      85              : 
      86           24 :         return;
      87              :     }
      88              : 
      89              :     // Do the whole image resizing thang.
      90            3 :     imageGrabber.fetchUrls(imageURLs.values());
      91           27 : }
      92              : 
      93          258 : bool RawFeedRewriter::isHTMLEmpty(QString html)
      94              : {
      95          258 :     html.replace(" ", "");
      96          258 :     html.replace("\t", "");
      97          258 :     html.replace("\n", "");
      98              : 
      99          258 :     return html.size() == 0;
     100              : }
     101              : 
     102           24 : bool RawFeedRewriter::isShareURL(const QString &url)
     103              : {
     104          187 :     for (QString shareURL : shareButtonURLs) {
     105          164 :         if (url.contains(shareURL, Qt::CaseInsensitive)) {
     106            1 :             return true;
     107              :         }
     108          164 :     }
     109              : 
     110           23 :     return false;
     111              : }
     112              : 
     113          164 : QString RawFeedRewriter::intToID(int id)
     114              : {
     115          164 :     return "FangID_" + QString::number(id);
     116              : }
     117              : 
     118           26 : QString RawFeedRewriter::rewriteFirstPass(const QString &document, QSet<QUrl> &imageURLs)
     119              : {
     120              :     // We use TidyLib via WebPageGrabber to convert the (potentially crappy) HTML into proper
     121              :     // XHTML.  This will add a doctype and other unwanted headers/footers, so we strip those
     122              :     // out in a separate post-processing method.  You'll see.
     123           26 :     QString* doc = webPageGrabber.load("<html><body>" + document + "</body></html>");
     124           26 :     if (doc == nullptr) {
     125            0 :         qDebug() << "Error loading HTML document";
     126              : 
     127            0 :         return "";
     128              :     }
     129              : 
     130              :     // Swap out non-breaking spaces here since QXmlStreamReader doesn't handle them well.
     131           26 :     doc->replace("&nbsp;", " ", Qt::CaseInsensitive);
     132              : 
     133              :     // We're going to count the number of tags to determine if this is a real HTML document,
     134              :     // or a text document.
     135           26 :     int tagCount = 0;
     136              : 
     137           26 :     QXmlStreamReader xml;
     138           26 :     xml.addData(*doc);
     139              : 
     140           26 :     QString output;
     141           26 :     QXmlStreamWriter writer(&output);
     142           26 :     writer.setAutoFormatting(false);
     143              : 
     144              :     // If we're skipping elements, this is >= 1
     145           26 :     int skip = 0;
     146              : 
     147              :     // Current stack.
     148           26 :     QStack<DOMNode> stack;
     149              : 
     150              :     // Was the last node text?
     151           26 :     bool lastWasText = false;
     152              : 
     153          938 :     while (!xml.atEnd()) {
     154              :         // Grab the next thingie.
     155          912 :         xml.readNext();
     156              : 
     157          912 :         if (xml.isStartElement()) {
     158              :             // Start
     159          247 :             tagCount++;
     160              : 
     161          247 :             if (0 == skip) {
     162          194 :                 QString tagName = xml.name().toString().toLower();
     163          388 :                 QString classValue = xml.attributes().value("class").toString();
     164          388 :                 QString href = xml.attributes().value(S_HREF).toString();
     165              : 
     166          358 :                 if (tagsToRemove.contains(tagName) ||
     167          328 :                         classesToRemove.contains(classValue) || // Delete known bad classes
     168          551 :                         (tagName == "a" && isShareURL(href)) || // Delete share links
     169          363 :                         (tagName == "br" && !lastWasText)) {    // Delete br's that weren't preceeded by text.
     170              :                     // Skip it good!
     171           36 :                     skip = 1;
     172              :                 } else {
     173              :                     // Write the tag.
     174          158 :                     writer.writeStartElement(tagName);
     175              : 
     176          158 :                     intID++;
     177          316 :                     writer.writeAttribute(S_ID, intToID(intID));
     178              : 
     179              :                     // If there's a parent node, add a child.
     180          158 :                     if (stack.size()) {
     181          132 :                         stack.top().numChildren++;
     182              :                     }
     183              : 
     184              :                     // Push it.
     185          158 :                     stack.push(DOMNode(tagName, intID));
     186              : 
     187              :                     // Anchor tags.
     188          181 :                     if (tagName == "a" && xml.attributes().hasAttribute(S_HREF)) {
     189           69 :                         writer.writeAttribute(S_HREF, xml.attributes().value(S_HREF).toString());
     190              :                     }
     191              : 
     192              :                     // Image tags.
     193          167 :                     if (tagName == S_IMG && xml.attributes().hasAttribute(S_SRC)) {
     194           18 :                         QString imgSrc =  NetworkUtilities::urlFixup(xml.attributes().value(S_SRC).toString());
     195           18 :                         writer.writeAttribute(S_SRC, imgSrc);
     196              : 
     197           18 :                         QString sWidth = xml.attributes().value(S_WIDTH).toString();
     198           18 :                         QString sHeight = xml.attributes().value(S_HEIGHT).toString();
     199              : 
     200              :                         bool widthOK, heightOK;
     201            9 :                         int width = sWidth.toInt(&widthOK);
     202            9 :                         int height = sHeight.toInt(&heightOK);
     203              : 
     204            9 :                         if (widthOK && heightOK) {
     205            6 :                             if (width < 3 || height < 3) {
     206              :                                 // Delete tiny images.
     207            1 :                                 idsToDelete << intToID(intID);
     208              :                             } else {
     209              :                                 // Resize image if needed.
     210              :                                 int newWidth, newHeight;
     211            5 :                                 imageResize(width, height, &newWidth, &newHeight);
     212           10 :                                 writer.writeAttribute(S_WIDTH, QString::number(newWidth));
     213           10 :                                 writer.writeAttribute(S_HEIGHT, QString::number(newHeight));
     214              :                             }
     215            6 :                         } else {
     216              :                             // Dammit, we're gonna have to fetch this image!
     217            3 :                             imageURLs << imgSrc;
     218              :                         }
     219            9 :                     }
     220              :                 }
     221              : 
     222          194 :                 lastWasText = false;
     223          194 :             } else {
     224           53 :                  skip++;
     225              :             }
     226          665 :         } else if (xml.isEndElement()) {
     227          247 :             QString tagName = xml.name().toString().toLower();
     228              : 
     229              :             // End
     230          247 :             if (0 == skip) {
     231          158 :                 writer.writeEndElement();
     232              : 
     233              :                 // Pop our node and investigate.
     234          158 :                 DOMNode dom = stack.pop();
     235              : 
     236              :                 // If it's a container and we didn't write any text, then delete this tag in the
     237              :                 // second pass.
     238          158 :                 if (containerTags.contains(tagName) && dom.nonEmptyTextCount == 0 && dom.numChildren == 0) {
     239              :                     //
     240              :                     // This doesn't work -- at the very least the IDs are wrong.  We need to
     241              :                     // employ a stack here.
     242              :                     //
     243            5 :                     idsToDelete << intToID(dom.intID);
     244              :                 }
     245              : 
     246          158 :                 lastWasText = false;
     247          158 :             } else {
     248           89 :                 skip--;
     249              :             }
     250          665 :         } else if (xml.isCharacters() && 0 == skip) {
     251              :             // Text
     252          258 :             QString text = xml.text().toString();
     253          258 :             bool isEmpty = isHTMLEmpty(text);
     254              : 
     255              :             // Don't allow pure empty tags, though a single space is ok.
     256          258 :             if (!isEmpty || text == " ") {
     257           88 :                 bool addSpaceStart = text.startsWith('\n');
     258           88 :                 bool addSpaceEnd = text.endsWith('\n');
     259              : 
     260              :                 // Text can start or end with a newline -- delete 'em.
     261           88 :                 removeNewlinesBothSides(text);
     262              : 
     263              :                  // Add back extra spaces so text doesn'truntogether.
     264           88 :                 if (addSpaceStart) {
     265           10 :                     text = ' ' + text;
     266              :                 }
     267              : 
     268           88 :                 if (addSpaceEnd) {
     269            7 :                     text = text + ' ';
     270              :                 }
     271              : 
     272              :                 // Write the text!
     273           88 :                 writer.writeCharacters(text);
     274              : 
     275           88 :                 if (!isEmpty) {
     276           81 :                     stack.top().nonEmptyTextCount++;
     277              :                 }
     278              : 
     279           88 :                 lastWasText = true;
     280              :             }
     281          418 :         } else if (xml.isEntityReference() && 0 == skip) {
     282              :             // Entity
     283            0 :             QString entity = xml.name().toString();
     284            0 :             writer.writeEntityReference(entity);
     285          160 :         } else if (xml.isStartDocument()) {
     286              :             // Doc start
     287           26 :             writer.writeStartDocument("1.0");
     288          134 :         } else if (xml.isEndDocument()) {
     289              :             // Doc end
     290           26 :             writer.writeEndElement();
     291              :         }
     292              :     }
     293              : 
     294           26 :     if (xml.hasError()) {
     295            0 :         qDebug() << "Error reading XML: " << xml.errorString();
     296              :     }
     297              : 
     298           26 :     if (writer.hasError()) {
     299            0 :         qDebug() << "QXmlStreamWriter had an error of some kind.";
     300              :     }
     301              : 
     302              : 
     303           30 :     if (tagCount <= 5 && output !=
     304           30 :             "<?xml version=\"1.0\"?><html id=\"FangID_1\"><body id=\"FangID_2\"/></html>") {
     305              :         // Turns out we're not dealing with an HTML document: there's not enough tags, and it's
     306              :         // not an empty document (which can be caused by bad HTML.)
     307              :         // Ditch the Tidy'd doc and rewrite as plain text from the original.
     308            3 :         return rewriteTextOnlyNews(document);
     309              :     }
     310              : 
     311              :     // Return new document.
     312           23 :     return output;
     313           26 : }
     314              : 
     315           27 : void RawFeedRewriter::rewriteAllSecondPass()
     316              : {
     317              :     // Iterate over all the news... again!
     318           54 :     for (RawNews* news : *newsList) {
     319           27 :         if (news->content.size()) {
     320            0 :             news->content = rewriteSecondPass(news->content);
     321              :         }
     322              : 
     323           27 :         if (news->description.size()) {
     324           26 :             news->description = rewriteSecondPass(news->description);
     325              :         }
     326              :     }
     327           27 : }
     328              : 
     329           26 : QString RawFeedRewriter::rewriteSecondPass(QString &docString)
     330              : {
     331              :     // If it was a text-only document, we've prepended it with an ASCII beep.  All we have to do
     332              :     // here is remove the beep and return it.
     333           26 :     if (docString.startsWith('\07')) {
     334            3 :         return docString.mid(1);
     335              :     }
     336              : 
     337           23 :     QXmlStreamReader xml;
     338           23 :     xml.addData(docString);
     339              : 
     340           23 :     QString output;
     341           23 :     QXmlStreamWriter writer(&output);
     342           23 :     writer.setAutoFormatting(false);
     343           23 :     int skip = 0; // Skip stack.
     344           23 :     QString lastTag = "";
     345              : 
     346          458 :     while (!xml.atEnd()) {
     347              :         // Grab the next thingie.
     348          435 :         xml.readNext();
     349              : 
     350          435 :         if (xml.isStartElement()) {
     351          152 :             if (0 == skip) {
     352              :                 // Start
     353          152 :                 QString tagName = xml.name().toString().toLower();
     354          304 :                 QString id = xml.attributes().value(S_ID).toString();
     355              : 
     356          152 :                 if (idsToDelete.contains(id)) {
     357              :                     // We need to delete this tag! Skip it.
     358            6 :                     skip = 1;
     359          146 :                 } else if (tagName == S_IMG) {
     360           16 :                     QString url = xml.attributes().value(S_SRC).toString();
     361              : 
     362            8 :                     int width = 0;
     363            8 :                     int height = 0;
     364              : 
     365              :                     // We got an image.
     366           34 :                     if (xml.attributes().hasAttribute(S_WIDTH) &&
     367           18 :                             xml.attributes().hasAttribute(S_HEIGHT)) {
     368              :                         // Already have attributes?  Cool.
     369           10 :                         width = xml.attributes().value(S_WIDTH).toInt();
     370           10 :                         height = xml.attributes().value(S_HEIGHT).toInt();
     371              :                     } else {
     372            3 :                         QImage image = imageGrabber.getResults()->value(url);
     373            3 :                         if (!image.isNull()) {
     374              :                             // Resize that baby, yeah!
     375            2 :                             imageResize(image.width(), image.height(), &width, &height);
     376              :                         }
     377            3 :                     }
     378              : 
     379            8 :                     if (width > 2 && height > 2) {
     380              :                         // Okay, we got a good image and it's not a tracking pixel. Satisfaction!
     381            7 :                         writer.writeStartElement(tagName);
     382           14 :                         writer.writeAttribute(S_SRC, url);
     383           14 :                         writer.writeAttribute(S_WIDTH, QString::number(width));
     384           14 :                         writer.writeAttribute(S_HEIGHT, QString::number(height));
     385            7 :                         writer.writeAttribute("align", "left"); // Always left-align.
     386              : 
     387            7 :                         lastTag = tagName;
     388              :                     } else {
     389              :                         // Bad image! Skip!
     390            1 :                         skip = 1;
     391              :                     }
     392            8 :                 } else {
     393              :                     // Write the tag and all attributes (except for ID)
     394          138 :                     writer.writeStartElement(tagName);
     395          299 :                     for (QXmlStreamAttribute attribute : xml.attributes()) {
     396          161 :                         if (attribute.name().toString() != S_ID) {
     397           23 :                             writer.writeAttribute(attribute);
     398              :                         }
     399          299 :                     }
     400              : 
     401          138 :                     lastTag = tagName;
     402              :                 }
     403          152 :             } else {
     404            0 :                 skip++;
     405              :             }
     406          283 :         } else if (xml.isEndElement()) {
     407              :             // End
     408          152 :             if (0 == skip) {
     409          145 :                 writer.writeEndElement();
     410              :             } else {
     411            7 :                 skip--;
     412              :             }
     413          131 :         } else if (xml.isCharacters() && 0 == skip) {
     414              :             // Text
     415           81 :             QString text = xml.text().toString();
     416           81 :             if (lastTag != "pre" && lastTag != "code") {
     417              :                 // This happens due to some kind of auto-formatting glitch.
     418           80 :                 text.replace("\n", " ");
     419              :             }
     420              : 
     421           81 :             writer.writeCharacters(text);
     422           81 :             lastTag = "#text";
     423          131 :         } else if (xml.isEntityReference() && 0 == skip) {
     424              :             // Entity
     425            0 :             QString entity = xml.name().toString();
     426            0 :             writer.writeEntityReference(entity);
     427            0 :             lastTag = "#entity";
     428           50 :         } else if (xml.isStartDocument()) {
     429              :             // Doc start
     430           23 :             writer.writeStartDocument(xml.documentVersion().toString());
     431           27 :         } else if (xml.isEndDocument()) {
     432              :             // Doc end;xml.documentVersion()
     433           23 :             writer.writeEndElement();
     434              :         }
     435              :     }
     436              : 
     437           23 :     if (xml.hasError()) {
     438            0 :         qDebug() << "QXmlStreamReader had error: " << xml.errorString();
     439              :     }
     440              : 
     441           23 :     if (writer.hasError()) {
     442            0 :         qDebug() << "QXmlStreamWriter had an error of some kind.";
     443              :     }
     444              : 
     445              :     // Return new document.
     446           23 :     return output;
     447           23 : }
     448              : 
     449           27 : void RawFeedRewriter::postProcess()
     450              : {
     451              :     // Iterate over all the news we have.
     452           54 :     for (RawNews* news : *newsList) {
     453           27 :         if (news->content.size()) {
     454            0 :             postProcessDocString(news->content);
     455              :         }
     456              : 
     457           27 :         if (news->description.size()) {
     458           26 :             postProcessDocString(news->description);
     459              :         }
     460              :     }
     461           27 : }
     462              : 
     463           26 : void RawFeedRewriter::postProcessDocString(QString &docString)
     464              : {
     465              :     // The R is for Redundant!
     466           26 :     docString.replace("\r", "");
     467              : 
     468              :     // Rip out headers/footers.
     469           26 :     docString.replace("<?xml version=\"1.0\"?><html>", "");
     470           26 :     docString.replace("<body>", "");
     471           26 :     docString.replace("</body></html>", "");
     472           26 :     docString.replace("<body/></html>", ""); // Empty body!
     473              : 
     474              :     // This happens.
     475           26 :     docString = docString.trimmed();
     476           26 : }
     477              : 
     478            7 : void RawFeedRewriter::imageResize(int width, int height, int *newWidth, int *newHeight)
     479              : {
     480            7 :     *newWidth = width;
     481            7 :     *newHeight = height;
     482              : 
     483            7 :     if (width >= MAX_ELEMENT_WIDTH) {
     484              :         // Scale down the image.
     485            7 :         *newWidth = MAX_ELEMENT_WIDTH;
     486            7 :         *newHeight = (double) height / (double) width * (double) MAX_ELEMENT_WIDTH;
     487              :     }
     488            7 : }
     489              : 
     490           88 : void RawFeedRewriter::removeNewlinesBothSides(QString &docString)
     491              : {
     492           98 :     while (docString.startsWith("\n")) {
     493           10 :         docString = docString.mid(1);
     494              :     }
     495              : 
     496           95 :     while (docString.endsWith("\n")) {
     497            7 :         docString = docString.left(docString.length() - 1);
     498              :     }
     499           88 : }
     500              : 
     501            3 : QString RawFeedRewriter::rewriteTextOnlyNews(QString input)
     502              : {
     503            3 :     QString output;
     504              : 
     505              :     // Keep it simple, stupid.
     506            3 :     input = input.trimmed();
     507            3 :     input.replace("\r\n", "\r");
     508            3 :     input.replace("\r", "\n");
     509              : 
     510            3 :     QStringList list = input.split('\n', Qt::SkipEmptyParts);
     511            9 :     for (QString line : list) {
     512              :         // Trim lines, and skip empty ones.
     513            6 :         QString trimmed = line.trimmed();
     514            6 :         if (!trimmed.isEmpty()) {
     515            5 :             output += "<p>" + trimmed + "</p>";
     516              :         }
     517            6 :     }
     518              : 
     519              :     // As a signal to the 2nd pass, we prepend the output with an ASCII beep character.  2nd pass
     520              :     // will remove this and return the string without further modification.s
     521            3 :     output = '\07' + output;
     522              : 
     523            6 :     return output;
     524            3 : }
     525              : 
     526            3 : void RawFeedRewriter::onImageGrabberFinished()
     527              : {
     528              :     // Gotta do this, g.
     529            3 :     rewriteAllSecondPass();
     530            3 :     postProcess();
     531              : 
     532            3 :     emit finished();
     533            3 : }
        

Generated by: LCOV version 2.0-1