LCOV - code coverage report
Current view: top level - src/parser - ParserXMLWorker.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 92.6 % 299 277
Test Date: 2026-01-27 22:31:25 Functions: 100.0 % 16 16

            Line data    Source code
       1              : #include "ParserXMLWorker.h"
       2              : #include <QtCore/qtimezone.h>
       3              : #include "../utilities/ErrorHandling.h"
       4              : 
       5           86 : ParserXMLWorker::ParserXMLWorker(QObject *parent) :
       6           86 :     FangObject(parent), feed(nullptr), currentItem(nullptr), isValid(false), inAtomXHTML(false)
       7              : {
       8           86 : }
       9              : 
      10          172 : ParserXMLWorker::~ParserXMLWorker()
      11              : {
      12           86 :     delete feed;
      13          172 : }
      14              : 
      15           60 : void ParserXMLWorker::documentStart()
      16              : {
      17              :     // Make a new feed!  Yay!
      18           60 :     delete feed;
      19           60 :     feed = new RawFeed();
      20           60 :     isValid = true;
      21              :     
      22           60 :     resetParserVars();
      23           60 : }
      24              : 
      25           59 : void ParserXMLWorker::documentEnd()
      26              : {
      27           59 :     if (isValid) {
      28           59 :         if (feed->items.size() == 0) {
      29              :             // Edge case: we typically save the summary when we encounter the first item. This
      30              :             // handles the case where they were no items but we might have a summary.
      31            1 :             saveSummary();
      32              :         }
      33           59 :         emit done(feed);
      34              :     }
      35              :     
      36              :     // If it's not valid, we already emitted a signal.
      37           59 : }
      38              : 
      39           59 : void ParserXMLWorker::addXML(QByteArray data)
      40              : {
      41           59 :     if (!isValid) {
      42            0 :         return;
      43              :     }
      44              :     
      45           59 :     xml.addData(data);
      46              :     
      47        88714 :     while (!xml.atEnd()) {
      48              :         // Grab the next thingie.
      49        88655 :         xml.readNext();
      50              :         
      51        88655 :         if (xml.isStartElement()) {
      52        23936 :             elementStart();
      53        64719 :         } else if (xml.isEndElement()) {
      54        23936 :             elementEnd();
      55        40783 :         } else if (xml.isCharacters() && !xml.isWhitespace()) {
      56        17811 :             elementContents();
      57              :         }
      58              :     }
      59              :     
      60              :     // Standards need to be a bit lax for RSS.
      61           59 :     if (xml.error() && xml.error() != QXmlStreamReader::PrematureEndOfDocumentError &&
      62            0 :             xml.error() != QXmlStreamReader::NotWellFormedError) {
      63            0 :         isValid = false;
      64            0 :         qWarning() << "XML ERROR:" << xml.lineNumber() << ": " << xml.errorString();
      65            0 :         emit done(nullptr);
      66              :     }
      67              :     
      68              : }
      69              : 
      70              : 
      71        23936 : void ParserXMLWorker::elementStart()
      72              : {
      73        23936 :     QString tagName = xml.name().toString().toLower();
      74              :     
      75              :     // Look for start of entries.
      76              :     //qDebug() << "XML node: " << xml.name().toString() << " " << xml.prefix().toString();
      77        23936 :     if ((tagName == "item" || tagName == "entry") && !inAtomXHTML) {
      78              :         
      79         1922 :         if (urlHref.isEmpty()) {
      80         3776 :             urlHref = xml.attributes().value("rss:about").toString();
      81              :         }
      82              :         
      83         1922 :         if (numItems == 0) {
      84              :             // Oh, first item?  Assume we've seen the summary then.
      85           58 :             saveSummary();
      86              :         }
      87              :         
      88         1922 :         currentItem = new RawNews(feed);
      89         1922 :         numItems++;
      90        22756 :     } else if ((tagName == "content" || tagName == "summary") && 
      91        23498 :                xml.attributes().value("type").toString().toLower() == "xhtml") {
      92              :         // Atom has a crappy feature where you can just stick unescaped xhtml
      93              :         // into the Atom's DOM.  Someone at Google must not believe in SAX
      94              :         // parsers, I guess?
      95           99 :         inAtomXHTML = true;
      96        21915 :     } else if (inAtomXHTML) {
      97              :         // Build a string of the tag's elements.
      98         2802 :         QString elements = "";
      99         2802 :         QXmlStreamAttributes attributes = xml.attributes();
     100         4204 :         for (QXmlStreamAttribute attribute : attributes) {
     101         2804 :             elements += " " + attribute.name().toString() + "=\""
     102         4206 :                         + attribute.value().toString() + "\"";
     103         1402 :         }
     104              :         
     105              :         // Mash the tag together.
     106         2802 :         content += "<" + xml.qualifiedName().toString() + elements + ">";
     107              :         
     108              :         // Early exit!
     109         2802 :         return;
     110         2802 :     }
     111              :     
     112        21134 :     currentTag = tagName;
     113        21134 :     currentPrefix = xml.prefix().toString().toLower();
     114        42268 :     hasType = xml.attributes().hasAttribute("type");
     115              :     
     116        23186 :     if (currentTag == "link" && urlHref.isEmpty() && xml.attributes().hasAttribute("href")) {
     117              :         // Used by atom feeds to grab the first link.
     118          416 :         urlHref = xml.attributes().value("href").toString();
     119              :     }
     120              :     
     121              :     // Add this new tag to our stack. :)
     122        21134 :     tagStack.push(tagName);
     123        23936 : }
     124              : 
     125        23936 : void ParserXMLWorker::elementEnd()
     126              : {
     127        23936 :     if (!inAtomXHTML) {
     128        21035 :         tagStack.pop(); // Pop our tag stack, we're through with this one!
     129              :     }
     130              :     
     131        23936 :     QString tagName = xml.name().toString().toLower();
     132              :     
     133        23936 :     if ((tagName == "item" || tagName == "entry") && !inAtomXHTML) {
     134              :         //qDebug() << "End element:" << xml.name().toString();
     135         1922 :         if (currentItem == nullptr) {
     136              :             // Throw some kinda error, this can't happen.
     137            0 :             qDebug() << "Current item is null!";
     138            0 :             qDebug() << "Current title: " << title;
     139            0 :             qDebug() << "Xml element: " << tagName;
     140              :         }
     141              :         
     142              :         // Figure out which date to use.
     143         1922 :         QString timestamp;
     144         1922 :         if (!pubdate.trimmed().isEmpty()) {
     145         1679 :             timestamp = pubdate;
     146          243 :         } else if (!lastbuilddate.trimmed().isEmpty()) {
     147            0 :             timestamp = lastbuilddate;
     148          243 :         } else if (!created.trimmed().isEmpty()) {
     149           15 :             timestamp = created;
     150          228 :         } else if (!date.trimmed().isEmpty()) {
     151           69 :             timestamp = date;
     152          159 :         } else if (!updated.trimmed().isEmpty()) {
     153          159 :             timestamp = updated;
     154              :         }
     155              :         
     156              :         // Determine the GUID.
     157         1922 :         QString myGuid;
     158         1922 :         if (!id.trimmed().isEmpty()) {
     159          174 :             myGuid = id.trimmed();
     160         1748 :         } else if (!guid.trimmed().isEmpty()) {
     161         1314 :             myGuid = guid.trimmed();
     162          434 :         } else if (!urlData.trimmed().isEmpty()) {
     163          434 :             myGuid = urlData.trimmed();
     164              :         } else {
     165            0 :             myGuid = urlHref.trimmed();
     166              :         }
     167              : 
     168              :         // Skip items without a GUID - malformed feed
     169         1922 :         if (myGuid.isEmpty()) {
     170            0 :             qWarning() << "ParserXMLWorker: RSS/Atom item missing GUID/URL, skipping item";
     171            0 :             qWarning() << "  Title:" << title;
     172            0 :             delete currentItem;
     173            0 :             currentItem = nullptr;
     174              : 
     175              :             // Clear all strings for next item
     176            0 :             author = title = subtitle = content = QString();
     177            0 :             urlData = urlHref = guid = id = date = updated = timestamp = QString();
     178            0 :             return;
     179              :         }
     180              : 
     181              :         // Item space.
     182         1922 :         currentItem->author = author;
     183         1922 :         currentItem->title = title;
     184         1922 :         currentItem->description = subtitle;
     185         1922 :         currentItem->content = content;
     186         1922 :         currentItem->url = urlData.isEmpty() ? QUrl(urlHref) : QUrl(urlData);
     187         1922 :         currentItem->timestamp = dateFromFeedString(timestamp);
     188         1922 :         currentItem->guid = myGuid;
     189              :         
     190              :         // Okay, give it up. :(
     191         1922 :         if (!currentItem->timestamp.isValid()) {
     192            0 :             qDebug() << "Time string: " << timestamp;
     193            0 :             qDebug() << "invalid date!";
     194              :         }
     195              :         
     196              :         
     197         1922 :         feed->items.append(currentItem);
     198         1922 :         currentItem = nullptr;
     199              :         
     200              :         // Clear all strings.
     201         1922 :         title = "";
     202         1922 :         urlHref = "";
     203         1922 :         urlData = "";
     204         1922 :         subtitle = "";
     205         1922 :         pubdate = "";
     206         1922 :         lastbuilddate = "";
     207         1922 :         created = "";
     208         1922 :         date = "";
     209         1922 :         updated = "";
     210         1922 :         author = "";
     211         1922 :         content = "";
     212         1922 :         guid = "";
     213         1922 :         id = "";
     214        23936 :     } else if (tagName == "content" || tagName == "summary") {
     215              :         // Just accept that this is the end of one of these:
     216              :         // <contents type="xhtml">
     217          742 :         if (inAtomXHTML) {
     218           99 :             inAtomXHTML = false;
     219           99 :             tagStack.pop(); // We didn't do this earlier, you see.
     220              :         }
     221              :     }
     222              :     
     223        23936 :     if (inAtomXHTML) {
     224              :         // SLORG we need to add this tag to the contents.
     225              :         
     226              :         // TODO: Is there a better way to do this?!
     227         2802 :         content += "</" + xml.qualifiedName().toString() + ">";
     228              :     }
     229        23936 : }
     230              : 
     231        17811 : void ParserXMLWorker::elementContents()
     232              : {
     233        17811 :     if (inAtomXHTML) {
     234              :         // Atom sucks!
     235         1860 :         content += xml.text().toString();
     236              :         
     237         1860 :         return; // Early exit.
     238              :     }
     239              :     
     240        15951 :     QString parentTag = getTagStackAt(1);
     241        15951 :     if (parentTag == "item" || parentTag == "entry") {
     242              :         //
     243              :         // Inside a news item.
     244              :         //
     245              :         
     246        14613 :         if (currentTag == "title" && currentPrefix == "") {
     247         1922 :             title += xml.text().toString();
     248        12691 :         } else if (currentTag == "link" && currentPrefix == "") {
     249         1748 :             urlData += xml.text().toString();
     250        10943 :         } else if (currentTag == "description" || currentTag == "summary") {
     251         1939 :             subtitle += xml.text().toString();
     252         9004 :         } else if (currentTag == "name") {
     253            0 :             author += xml.text().toString();
     254         9004 :         } else if (currentTag == "pubdate") {
     255         1679 :             pubdate += xml.text().toString();
     256         7325 :         } else if (currentTag == "lastbuilddate") {
     257            0 :             lastbuilddate += xml.text().toString();
     258         7325 :         } else if (currentTag == "created") {
     259           15 :             created += xml.text().toString();
     260         7310 :         } else if (currentTag == "updated") {
     261          184 :             updated += xml.text().toString();
     262         7126 :         } else if (currentTag == "date") {
     263          154 :             date += xml.text().toString();
     264         6972 :         } else if (currentTag == "guid") {
     265         1314 :             guid += xml.text().toString();
     266         5658 :         } else if (currentTag == "id") {
     267          174 :             id += xml.text().toString();
     268         5689 :         } else if ((currentTag == "encoded" && currentPrefix == "content")
     269         5689 :                    || (currentTag == "content" && hasType)) {
     270          260 :             content += xml.text().toString();
     271              :         }
     272         1338 :     } else if (parentTag == "channel" || parentTag == "feed") {
     273              :         //
     274              :         // Top level items.
     275              :         //
     276              :         
     277          401 :         if (currentTag == "title" && currentPrefix == "") {
     278           59 :             title += xml.text().toString();
     279          342 :         } else if (currentTag == "link" && currentPrefix == "") {
     280           52 :             urlData += xml.text().toString();
     281          290 :         } else if (currentTag == "description" || currentTag == "summary") {
     282           45 :             subtitle += xml.text().toString();
     283              :         }
     284              :     }
     285        15951 : }
     286              : 
     287           60 : void ParserXMLWorker::resetParserVars()
     288              : {
     289           60 :     xml.clear();
     290              : 
     291           60 :     numItems = 0;
     292           60 :     currentTag = "";
     293           60 :     currentPrefix = "";
     294           60 :     urlHref = "";
     295           60 :     title = "";
     296           60 :     subtitle = "";
     297           60 :     content = "";
     298           60 :     pubdate = "";
     299           60 :     lastbuilddate = "";
     300           60 :     created = "";
     301           60 :     updated = "";
     302           60 :     date = "";
     303           60 :     author = "";
     304           60 :     guid = "";
     305           60 :     id = "";
     306           60 :     hasType = false;
     307           60 :     inAtomXHTML = false;
     308           60 :     tagStack.clear();
     309           60 : }
     310              : 
     311           59 : void ParserXMLWorker::saveSummary()
     312              : {
     313              :     // Global space.
     314           59 :     feed->title = title;
     315           59 :     feed->subtitle = subtitle;
     316           59 :     feed->siteURL = urlData.isEmpty() ? QUrl(urlHref) : QUrl(urlData);
     317              : 
     318              :     // Clear all local strings.
     319           59 :     title = "";
     320           59 :     urlHref = "";
     321           59 :     urlData = "";
     322           59 :     subtitle = "";
     323           59 :     pubdate = "";
     324           59 :     lastbuilddate = "";
     325           59 :     updated = "";
     326           59 :     date = "";
     327           59 :     author = "";
     328           59 :     content = "";
     329           59 :     guid = "";
     330           59 :     id = "";
     331           59 : }
     332              : 
     333              : 
     334         1922 : QDateTime ParserXMLWorker::dateFromFeedString(const QString& _timestamp)
     335              : {
     336         1922 :     QDateTime ret; // Defaults to invalid timestamp.
     337              :     
     338              :     // Come up with a few versions of the time stamp.
     339         1922 :     QString timestamp = _timestamp.trimmed();
     340         1922 :     yearFix(timestamp); //IMPORTANT: Must be done *before* weekday name is shaved.
     341         1922 :     shaveWeekdayName(timestamp);
     342         1922 :     monthMassager(timestamp);
     343              :     QString timestamps[] = {
     344              :         timestamp,
     345         3844 :         timestamp.left(timestamp.lastIndexOf(" ")).trimmed(),
     346         3844 :         timestamp.left(timestamp.lastIndexOf(".")).trimmed(),
     347         3844 :         timestamp.left(timestamp.lastIndexOf("-")).trimmed(),
     348         3844 :         timestamp.left(timestamp.lastIndexOf("+")).trimmed(),
     349              :         
     350              :         "" // must be last
     351        23064 :     };
     352              :     
     353              :     // Date time.  Comes in many (ugh) different formats.
     354              :     const QString dateFormats[] = { 
     355              :         // Most typical RSS format
     356              :         // Example: Tue, 02 Jul 2013 01:01:24 +0000 or Sun, 13 Oct 2013 19:15:29  PST
     357              :         // But Fang shaves off weekday names (see above), because they're useless and are often screwed up.
     358              :         "dd MMM yyyy hh:mm:ss",
     359              :         
     360              :         // One-digit minutes (yes, this happens.)
     361              :         "dd MMM yyyy hh:m:ss",
     362              :         
     363              :         // Same as above, but with full months.
     364              :         "dd MMMM yyyy hh:mm:ss",
     365              :         
     366              :         // Full month, one digit minutes.
     367              :         "dd MMMM yyyy hh:m:ss",
     368              :         
     369              :         // Also same as above, but with potentially single-digit days. (Used by "The Hindu".)
     370              :         "d MMM yyyy hh:mm:ss",
     371              :         
     372              :         // RFC 3339, normally used by Atom.
     373              :         // Example: 2013-08-07T16:47:54Z
     374              :         "yyyy-MM-ddThh:mm:ssZ",
     375              :         
     376              :         // Variant of the above without the trailing Z.
     377              :         // Example: 2012-05-30T19:46:42
     378              :         "yyyy-MM-ddThh:mm:ss",
     379              :         
     380              :         // Variant of the above without seconds OR a trailing Z.
     381              :         // Example: 2012-05-30T19:46
     382              :         "yyyy-MM-ddThh:mm",
     383              :         
     384              :         // Format used by some Chinese site.
     385              :         // Example: 2014-02-27 08:26:16.995
     386              :         "yyyy-MM-dd hh:mm:ss",
     387              :         
     388              :         // "Lokmat" uses this custom format.  I provide a single-spaced version for sanity's sake.
     389              :         // Example: 25-02-2014  01:08:10
     390              :         "dd-MM-yyyy  hh:mm:ss",
     391              :         "dd-MM-yyyy hh:mm:ss",
     392              :         
     393              :         
     394              :         "" // must be last!
     395        26908 :     };
     396              :     
     397              :     // Iterate over date formats.
     398         1922 :     int i = 0;
     399         7019 :     while (!ret.isValid() && !dateFormats[i].isEmpty()) {
     400         5097 :         const QString& format = dateFormats[i];
     401              :         
     402              :         // Try each format against each possible manipulated timestamp.
     403         5097 :         int j = 0;
     404        24946 :         while (!ret.isValid() && !timestamps[j].isEmpty()) {
     405        19849 :             QString& ts = timestamps[j];
     406        19849 :             ret = QDateTime::fromString(ts, format);
     407              :             
     408        19849 :             j++;
     409              :         }
     410              :         
     411         5097 :         i++;
     412              :     }
     413              :     
     414              :     // Check if there's a time-based adjustment and/or timezone.
     415              :     // For now we only look for time identifiers in the format of -hhmm or +hhmm
     416              :     //
     417              :     // TODO: Three-letter time zones. (TLAs like GMT, PST, etc.)
     418              :     //
     419         1922 :     int lastPlus = timestamp.lastIndexOf("+");
     420         1922 :     int lastMinus = timestamp.lastIndexOf("-");
     421         1922 :     if (lastPlus > 3 || lastMinus > 3) {
     422              :         // We have a plus or a minus.
     423         1427 :         int signPos = lastPlus > 3 ? lastPlus : lastMinus;
     424         1427 :         QString sAdjustment = timestamp.right(timestamp.length() - signPos);
     425         1427 :         sAdjustment = sAdjustment.trimmed();
     426              :         
     427              :         // Check for an hour/minute adjustment, in the format of -hhmm or +hhmm
     428              :         // OR in the format of -hh:mm or +hh:mm
     429         2627 :         if ((sAdjustment.length() == 5 || sAdjustment.length() == 6) &&
     430         2627 :                 (sAdjustment.startsWith("+") || sAdjustment.startsWith("-"))) {
     431         1200 :             int adjustment = 0; // Adjustment in minutes.
     432         1200 :             bool containsCol = sAdjustment.contains(':');
     433         1200 :             bool isNum = false;
     434         1200 :             int hours = 0;
     435         1200 :             int minutes = 0;
     436              :             
     437         1200 :             QString sNumber = sAdjustment.right(containsCol ? 5 : 4); // Skip + or -
     438              :             // YES!  We've got an adjustment!
     439         1200 :             hours = sNumber.left(2).toInt(&isNum);
     440         1200 :             if (isNum)
     441         1200 :                 minutes = sNumber.right(2).toInt(&isNum);
     442              :             
     443              :             // Looks like we're good!
     444         1200 :             if (isNum) {
     445              :                 // Condense down to minutes.
     446         1200 :                 minutes += (hours * 60);
     447         1200 :                 adjustment = sAdjustment.startsWith("-") ? minutes : -minutes;
     448              :                 
     449              :                 // Add in our adjustment if we need it.
     450         1200 :                 ret = ret.addSecs(adjustment * 60 /* seconds */);
     451              :             }
     452         1200 :         }
     453         1427 :     }
     454              :     
     455              :     // All times are (supposedly) in UTC.
     456         1922 :     ret.setTimeZone(QTimeZone::UTC);
     457              : 
     458         3844 :     return ret;
     459        40362 : }
     460              : 
     461              : 
     462         1922 : void ParserXMLWorker::yearFix(QString& timestamp)
     463              : {
     464              :     // If the timestamp is something like this:
     465              :     // Tue, 02 Jul 13 [etc]
     466              :     // We want to make it something like this:
     467              :     // Tue, 02 Jul 2013 [etc]
     468         1922 :     if (timestamp.length() == 0 || !timestamp[0].isLetter()) {
     469          331 :         return; // Early exit.
     470              :     }
     471              :     
     472         1591 :     bool seenWeekday = false;
     473         1591 :     bool seenDay = false;
     474         1591 :     bool seenMonth = false;
     475         1591 :     bool seenYear = false;
     476         1591 :     bool hitSpace = true; // This controls whether or not we examine the character.
     477         1591 :     int charsInYear = 0;
     478        27047 :     for (int i = 0; i < timestamp.length(); i++) {
     479        27047 :         if (hitSpace && (timestamp[i].isLetter() || timestamp[i] == ',')) {
     480         3182 :             hitSpace = false; // reset
     481              :             
     482         3182 :             if (!seenWeekday) {
     483         1591 :                 seenWeekday = true;
     484         1591 :             } else if (!seenMonth) {
     485         1591 :                 seenMonth = true;
     486              :             }
     487        23865 :         } else if (hitSpace && (timestamp[i].isDigit())) {
     488         3182 :             hitSpace = false; // reset
     489              :             
     490         3182 :             if (!seenDay) {
     491         1591 :                 seenDay = true;
     492         1591 :             } else if (!seenYear) {
     493         1591 :                 seenYear = true;
     494              :             }
     495        20683 :         } else if (timestamp[i].isSpace()) {
     496         6364 :             if (seenYear) {
     497              :                 // Here's where we find out if we can leave yet.
     498         1591 :                 if (charsInYear != 2) {
     499         1575 :                     break; // Early exit!
     500              :                 } else {
     501              :                     // Sigh... okay, now we have to back up and insert a "20".
     502              :                     // Currently we're here: [Tue, 02 Jul 13 ]
     503           16 :                     timestamp = timestamp.insert(i - 2, "20");
     504           16 :                     return; // YAY! WE DID IT!
     505              :                 }
     506              :             } else {
     507         4773 :                 hitSpace = true;
     508              :             }
     509              :         }
     510              :         
     511        25456 :         if (seenYear) {
     512         6332 :             ++charsInYear;
     513              :         }
     514              :     }
     515              : }
     516              : 
     517              : 
     518         1922 : void ParserXMLWorker::shaveWeekdayName(QString& timestamp)
     519              : {
     520              :     // NOTE:
     521              :     // By the time we've reached this method, the timestamp has
     522              :     // already been trimmed, and we've made sure the year has four digits.
     523              :     
     524         1922 :     int comma = timestamp.indexOf(',');
     525         1922 :     if (comma < 0) {
     526          331 :         return; // Early exit.
     527              :     }
     528              :     
     529              :     // Remove up to and including the comma itself.
     530         1591 :     timestamp = timestamp.remove(0, comma + 1).trimmed();
     531              : }
     532              : 
     533         1922 : void ParserXMLWorker::monthMassager(QString& timestamp)
     534              : {
     535              :     // Add new ones as they're encountered.
     536         1922 :     timestamp = timestamp.replace("Sept ", "Sep ");
     537         1922 : }
     538              : 
     539              : 
     540        15951 : QString ParserXMLWorker::getTagStackAt(qint32 n)
     541              : {
     542              :     // n is from 0..size - 1
     543        15951 :     if (tagStack.isEmpty() || (tagStack.size() - 1) < n)
     544            0 :         return "";
     545              :     
     546        15951 :     return tagStack.at(tagStack.size() - 1 - n);
     547              : }
        

Generated by: LCOV version 2.0-1