LCOV - code coverage report
Current view: top level - lib/FangFeedParser - RSSAtomParser.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 98.2 % 221 217
Test Date: 2026-04-19 00:35:54 Functions: 100.0 % 9 9

            Line data    Source code
       1              : #include "RSSAtomParser.h"
       2              : #include "FeedDateParser.h"
       3              : #include "FeedParserLogging.h"
       4              : 
       5              : // Returns true if the string contains only whitespace (or is empty).
       6         7912 : static bool isBlankOrEmpty(const QString& s)
       7              : {
       8         7912 :     return QStringView(s).trimmed().isEmpty();
       9              : }
      10              : 
      11              : // Some feeds (e.g. excelsior.com.mx) double-escape CDATA markers, producing
      12              : // literal "<![CDATA[...]]>" text instead of actual CDATA sections. Strip them.
      13         6705 : static QString stripEscapedCDATA(const QString& text)
      14              : {
      15         6705 :     QStringView view = QStringView(text).trimmed();
      16         6706 :     if (view.startsWith(u"<![CDATA[") && view.endsWith(u"]]>")) {
      17            1 :         return view.mid(9, view.size() - 12).trimmed().toString();
      18              :     }
      19         6704 :     return text;
      20              : }
      21              : 
      22           96 : std::unique_ptr<RawFeed> RSSAtomParser::parse(const QByteArray& data)
      23              : {
      24           96 :     RSSAtomParser worker;
      25           96 :     worker.feed = std::make_unique<RawFeed>();
      26           96 :     worker.isValid = true;
      27              : 
      28           96 :     worker.xml.addData(data);
      29       101720 :     while (!worker.xml.atEnd()) {
      30       101624 :         worker.xml.readNext();
      31       101624 :         if (worker.xml.isStartElement()) {
      32        27290 :             worker.elementStart();
      33        74334 :         } else if (worker.xml.isEndElement()) {
      34        27287 :             worker.elementEnd();
      35        47047 :         } else if (worker.xml.isCharacters() && !worker.xml.isWhitespace()) {
      36        20387 :             worker.elementContents();
      37              :         }
      38              :     }
      39              : 
      40           96 :     if (worker.xml.error()
      41            3 :         && worker.xml.error() != QXmlStreamReader::PrematureEndOfDocumentError
      42           99 :         && worker.xml.error() != QXmlStreamReader::NotWellFormedError) {
      43            0 :         worker.isValid = false;
      44            0 :         qCWarning(logFeedParser) << "XML ERROR:" << worker.xml.lineNumber()
      45            0 :                                  << ": " << worker.xml.errorString();
      46              :     }
      47              : 
      48           96 :     if (!worker.isValid) {
      49            0 :         return nullptr;
      50              :     }
      51              : 
      52           96 :     if (worker.feed->items.size() == 0) {
      53            4 :         worker.saveSummary();
      54              :     }
      55              : 
      56              :     // No items and no feed title means we never found RSS/Atom content.
      57           96 :     if (worker.feed->items.isEmpty() && worker.feed->title.isEmpty()) {
      58            2 :         return nullptr;
      59              :     }
      60              : 
      61           94 :     return std::move(worker.feed);
      62           96 : }
      63              : 
      64        27290 : void RSSAtomParser::elementStart()
      65              : {
      66        27290 :     QString tagName = xml.name().toString().toLower();
      67              : 
      68        27290 :     if ((tagName == "item" || tagName == "entry") && !state.inAtomXHTML) {
      69              : 
      70         2238 :         if (state.urlHref.isEmpty()) {
      71         4390 :             state.urlHref = xml.attributes().value("rss:about").toString();
      72              :         }
      73              : 
      74         2238 :         if (state.numItems == 0) {
      75           93 :             saveSummary();
      76              :         }
      77              : 
      78         2238 :         currentItem = std::make_shared<RawNews>();
      79         2238 :         state.numItems++;
      80        25874 :     } else if ((tagName == "content" || tagName == "summary") &&
      81        26696 :                xml.attributes().value("type").toString().toLower() == "xhtml") {
      82           99 :         state.inAtomXHTML = true;
      83           99 :         state.content.reserve(4096);
      84        24953 :     } else if (state.inAtomXHTML) {
      85              :         // Rebuild the XHTML tag directly into content (no temporaries).
      86         2802 :         state.content.append('<');
      87         2802 :         state.content.append(xml.qualifiedName());
      88         4204 :         for (const auto& attribute : xml.attributes()) {
      89         1402 :             state.content.append(' ');
      90         1402 :             state.content.append(attribute.name());
      91         2804 :             state.content.append(QStringLiteral("=\""));
      92         1402 :             state.content.append(attribute.value());
      93         1402 :             state.content.append('"');
      94         2802 :         }
      95         2802 :         state.content.append('>');
      96              : 
      97         2802 :         return;
      98              :     }
      99              : 
     100        24488 :     state.currentTag = tagName;
     101        24488 :     state.currentPrefix = xml.prefix().toString().toLower();
     102        48976 :     state.hasType = xml.attributes().hasAttribute("type");
     103              : 
     104              :     // Podcast detection: only flag itunes elements that are specific to actual
     105              :     // podcast feeds. Many non-podcast feeds (e.g. Substack blogs) include generic
     106              :     // itunes metadata like itunes:owner, itunes:author, and itunes:block.
     107        24488 :     if (state.currentPrefix == "itunes") {
     108         1441 :         if (state.currentTag == "duration" || state.currentTag == "episode"
     109          655 :                 || state.currentTag == "episodetype" || state.currentTag == "season"
     110         1441 :                 || state.currentTag == "explicit" || state.currentTag == "category") {
     111          264 :             state.hasPodcastSignals = true;
     112              :         }
     113              :     }
     114              : 
     115              :     // Podcast detection: audio enclosures.
     116        24488 :     if (state.currentTag == "enclosure") {
     117          796 :         QString type = xml.attributes().value("type").toString().toLower();
     118          398 :         if (type.startsWith("audio/")) {
     119          254 :             state.hasPodcastSignals = true;
     120              :         }
     121          398 :     }
     122              : 
     123              :     // Media RSS image extraction (media:thumbnail and media:content).
     124        24488 :     if (currentItem && state.currentPrefix == "media") {
     125         1444 :         if (state.currentTag == "thumbnail") {
     126         1206 :             QString url = xml.attributes().value("url").toString();
     127         1206 :             int width = xml.attributes().value("width").toString().toInt();
     128          603 :             if (!url.isEmpty() && (state.mediaImageURL.isEmpty() || width > state.mediaImageWidth)) {
     129          603 :                 state.mediaImageURL = url;
     130          603 :                 state.mediaImageWidth = width;
     131              :             }
     132         1444 :         } else if (state.currentTag == "content") {
     133          834 :             QString type = xml.attributes().value("type").toString().toLower();
     134          417 :             if (type.startsWith("image/")) {
     135          470 :                 QString url = xml.attributes().value("url").toString();
     136          470 :                 int width = xml.attributes().value("width").toString().toInt();
     137          235 :                 if (!url.isEmpty() && (state.mediaImageURL.isEmpty() || width > state.mediaImageWidth)) {
     138          173 :                     state.mediaImageURL = url;
     139          173 :                     state.mediaImageWidth = width;
     140              :                 }
     141          235 :             }
     142          417 :         }
     143              :     }
     144              : 
     145        26896 :     if (state.currentTag == "link" && state.urlHref.isEmpty() && xml.attributes().hasAttribute("href")) {
     146          444 :         state.urlHref = xml.attributes().value("href").toString();
     147              :     }
     148              : 
     149        24488 :     state.tagStack.push(tagName);
     150        27290 : }
     151              : 
     152        27287 : void RSSAtomParser::elementEnd()
     153              : {
     154        27287 :     if (!state.inAtomXHTML) {
     155        24386 :         state.tagStack.pop();
     156              :     }
     157              : 
     158        27287 :     QString tagName = xml.name().toString().toLower();
     159              : 
     160        27287 :     if ((tagName == "item" || tagName == "entry") && !state.inAtomXHTML) {
     161         2237 :         if (!currentItem) {
     162            2 :             qCWarning(logFeedParser) << "Current item is null!";
     163            2 :             qCWarning(logFeedParser) << "Current title: " << state.title;
     164            2 :             qCWarning(logFeedParser) << "Xml element: " << tagName;
     165            2 :             return;
     166              :         }
     167              : 
     168              :         // Figure out which date to use.
     169         2236 :         QString timestamp;
     170         2236 :         if (!isBlankOrEmpty(state.pubdate)) {
     171         1986 :             timestamp = state.pubdate;
     172          250 :         } else if (!isBlankOrEmpty(state.lastbuilddate)) {
     173            1 :             timestamp = state.lastbuilddate;
     174          249 :         } else if (!isBlankOrEmpty(state.created)) {
     175           15 :             timestamp = state.created;
     176          234 :         } else if (!isBlankOrEmpty(state.date)) {
     177           69 :             timestamp = state.date;
     178          165 :         } else if (!isBlankOrEmpty(state.updated)) {
     179          164 :             timestamp = state.updated;
     180              :         }
     181              : 
     182              :         // Determine the GUID.
     183         2236 :         QString myGuid;
     184         2236 :         if (!isBlankOrEmpty(state.id)) {
     185          179 :             myGuid = state.id.trimmed();
     186         2057 :         } else if (!isBlankOrEmpty(state.guid)) {
     187         1572 :             myGuid = state.guid.trimmed();
     188          485 :         } else if (!isBlankOrEmpty(state.urlData)) {
     189          484 :             myGuid = state.urlData.trimmed();
     190              :         } else {
     191            1 :             myGuid = state.urlHref.trimmed();
     192              :         }
     193              : 
     194              :         // Skip items without a GUID - malformed feed
     195         2236 :         if (myGuid.isEmpty()) {
     196            2 :             qCWarning(logFeedParser) << "RSSAtomParser: RSS/Atom item missing GUID/URL, skipping item";
     197            2 :             qCWarning(logFeedParser) << "  Title:" << state.title;
     198            1 :             currentItem.reset();
     199            1 :             state.clearItemFields();
     200            1 :             return;
     201              :         }
     202              : 
     203              :         // Item space.
     204         2235 :         currentItem->author = state.author;
     205         2235 :         currentItem->title = stripEscapedCDATA(state.title);
     206         2235 :         currentItem->description = stripEscapedCDATA(state.subtitle);
     207         2235 :         currentItem->content = stripEscapedCDATA(state.content);
     208              : 
     209         2235 :         currentItem->mediaImageURL = state.mediaImageURL;
     210              : 
     211         2235 :         currentItem->url = state.urlData.isEmpty() ? QUrl(state.urlHref) : QUrl(state.urlData);
     212         2235 :         currentItem->timestamp = FeedDateParser::dateFromFeedString(timestamp);
     213         2235 :         currentItem->guid = myGuid;
     214              : 
     215         2235 :         if (!currentItem->timestamp.isValid()) {
     216            2 :             qCDebug(logFeedParser) << "Time string: " << timestamp;
     217            2 :             qCDebug(logFeedParser) << "invalid date!";
     218              :         }
     219              : 
     220              : 
     221         2235 :         feed->items.append(currentItem);
     222         2235 :         feed->isPodcast = feed->isPodcast || state.hasPodcastSignals;
     223         2235 :         currentItem.reset();
     224              : 
     225         2235 :         state.clearItemFields();
     226        27287 :     } else if (tagName == "content" || tagName == "summary") {
     227          822 :         if (state.inAtomXHTML) {
     228           99 :             state.inAtomXHTML = false;
     229           99 :             state.tagStack.pop();
     230              :         }
     231              :     }
     232              : 
     233        27285 :     if (state.inAtomXHTML) {
     234         5604 :         state.content.append(QStringLiteral("</"));
     235         2802 :         state.content.append(xml.qualifiedName());
     236         2802 :         state.content.append('>');
     237              :     }
     238        27287 : }
     239              : 
     240        20387 : void RSSAtomParser::elementContents()
     241              : {
     242        20387 :     if (state.inAtomXHTML) {
     243         1860 :         state.content += xml.text().toString();
     244         1860 :         return;
     245              :     }
     246              : 
     247        18527 :     QStringView parentTag = getTagStackAt(1);
     248        20866 :     if (parentTag == u"item" || parentTag == u"entry") {
     249              :         //
     250              :         // Inside a news item.
     251              :         //
     252              : 
     253        16989 :         if (state.currentTag == "title" && state.currentPrefix == "") {
     254         2235 :             state.title += xml.text().toString();
     255        14754 :         } else if (state.currentTag == "link" && state.currentPrefix == "") {
     256         2057 :             state.urlData += xml.text().toString();
     257        23411 :         } else if ((state.currentTag == "description" || state.currentTag == "summary")
     258        23411 :                    && state.currentPrefix == "") {
     259         2106 :             state.subtitle += xml.text().toString();
     260        10591 :         } else if (state.currentTag == "name"
     261        10591 :                    || (state.currentTag == "creator" && state.currentPrefix == "dc")) {
     262          671 :             state.author += xml.text().toString();
     263         9920 :         } else if (state.currentTag == "pubdate") {
     264         1986 :             state.pubdate += xml.text().toString();
     265         7934 :         } else if (state.currentTag == "lastbuilddate") {
     266            1 :             state.lastbuilddate += xml.text().toString();
     267         7933 :         } else if (state.currentTag == "created") {
     268           15 :             state.created += xml.text().toString();
     269         7918 :         } else if (state.currentTag == "updated") {
     270          189 :             state.updated += xml.text().toString();
     271         7729 :         } else if (state.currentTag == "date") {
     272          154 :             state.date += xml.text().toString();
     273         7575 :         } else if (state.currentTag == "guid") {
     274         1572 :             state.guid += xml.text().toString();
     275         6003 :         } else if (state.currentTag == "id") {
     276          179 :             state.id += xml.text().toString();
     277         6089 :         } else if ((state.currentTag == "encoded" && state.currentPrefix == "content")
     278         6089 :                    || (state.currentTag == "content" && state.hasType)) {
     279          320 :             state.content += xml.text().toString();
     280              :         }
     281         2563 :     } else if (parentTag == u"channel" || parentTag == u"feed") {
     282              :         //
     283              :         // Top level items.
     284              :         //
     285              : 
     286          547 :         if (state.currentTag == "title" && state.currentPrefix == "") {
     287           94 :             state.title += xml.text().toString();
     288          453 :         } else if (state.currentTag == "link" && state.currentPrefix == "") {
     289           85 :             state.urlData += xml.text().toString();
     290          679 :         } else if ((state.currentTag == "description" || state.currentTag == "summary")
     291          679 :                    && state.currentPrefix == "") {
     292           57 :             state.subtitle += xml.text().toString();
     293              :         }
     294              :     }
     295              : }
     296              : 
     297         2333 : void RSSAtomParser::ParseState::clearItemFields()
     298              : {
     299         2333 :     title.clear();
     300         2333 :     subtitle.clear();
     301         2333 :     content.clear();
     302         2333 :     author.clear();
     303         2333 :     urlHref.clear();
     304         2333 :     urlData.clear();
     305         2333 :     pubdate.clear();
     306         2333 :     lastbuilddate.clear();
     307         2333 :     created.clear();
     308         2333 :     updated.clear();
     309         2333 :     date.clear();
     310         2333 :     guid.clear();
     311         2333 :     id.clear();
     312         2333 :     mediaImageURL.clear();
     313         2333 :     mediaImageWidth = 0;
     314         2333 : }
     315              : 
     316           97 : void RSSAtomParser::saveSummary()
     317              : {
     318           97 :     feed->title = state.title;
     319           97 :     feed->subtitle = state.subtitle;
     320           97 :     feed->siteURL = state.urlData.isEmpty() ? QUrl(state.urlHref) : QUrl(state.urlData);
     321           97 :     feed->isPodcast = state.hasPodcastSignals;
     322              : 
     323           97 :     state.clearItemFields();
     324           97 : }
     325              : 
     326              : 
     327        18527 : QStringView RSSAtomParser::getTagStackAt(qint32 n)
     328              : {
     329        18527 :     if (state.tagStack.isEmpty() || (state.tagStack.size() - 1) < n) {
     330            1 :         return QStringView();
     331              :     }
     332              : 
     333        18526 :     return state.tagStack.at(state.tagStack.size() - 1 - n);
     334              : }
        

Generated by: LCOV version 2.0-1