LCOV - code coverage report
Current view: top level - src/utilities - FeedDiscovery.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 94.8 % 173 164
Test Date: 2026-01-27 22:31:25 Functions: 95.2 % 21 20

            Line data    Source code
       1              : #include "FeedDiscovery.h"
       2              : #include <QXmlStreamReader>
       3              : #include <QSet>
       4              : #include <algorithm>
       5              : #include <QDebug>
       6              : #include "NetworkUtilities.h"
       7              : #include "ErrorHandling.h"
       8              : #include "../parser/NewsParser.h"
       9              : #include "../parser/BatchNewsParser.h"
      10              : #include "WebPageGrabber.h"
      11              : 
      12           44 : FeedDiscovery::FeedDiscovery(QObject *parent,
      13              :                            ParserInterface* firstParser,
      14              :                            ParserInterface* secondParser,
      15              :                            WebPageGrabber* pageGrabber,
      16           44 :                            BatchNewsParser* feedParser) :
      17              :     FangObject(parent),
      18           44 :     machine(),
      19           44 :     _error(false),
      20           44 :     _errorString(""),
      21           88 :     _feedResult(nullptr)
      22              : {
      23              :     // Handle secondParser: no longer used, but we need to clean it up if provided
      24           44 :     if (secondParser) {
      25           17 :         if (!secondParser->parent()) {
      26           17 :             secondParser->setParent(this);  // Take ownership so it gets cleaned up
      27              :         }
      28              :     }
      29              : 
      30              :     // Create default implementations if not provided (with this as parent for auto-cleanup)
      31           44 :     parserFirstTry = firstParser ? firstParser : new NewsParser(this);
      32           44 :     this->pageGrabber = pageGrabber ? pageGrabber : new WebPageGrabber(this);
      33           44 :     this->feedParser = feedParser ? feedParser : new BatchNewsParser(this);
      34              : 
      35              :     // Take ownership of injected dependencies by setting parent
      36           44 :     if (parserFirstTry && !parserFirstTry->parent()) {
      37           17 :         parserFirstTry->setParent(this);
      38              :     }
      39           44 :     if (this->pageGrabber && !this->pageGrabber->parent()) {
      40           17 :         this->pageGrabber->setParent(this);
      41              :     }
      42           44 :     if (this->feedParser && !this->feedParser->parent()) {
      43           17 :         this->feedParser->setParent(this);
      44              :     }
      45              : 
      46              :     // Set up our state machine.
      47           71 :     machine.addStateChange(CHECK_FEED, TRY_FEED, [this]() { onTryFeed(); });
      48           61 :     machine.addStateChange(TRY_FEED, FEED_FOUND, [this]() { onFeedFound(); });
      49           53 :     machine.addStateChange(TRY_FEED, WEB_GRABBER, [this]() { onWebGrabber(); });
      50           46 :     machine.addStateChange(WEB_GRABBER, VALIDATE_FEEDS, [this]() { onValidateFeeds(); });
      51           45 :     machine.addStateChange(VALIDATE_FEEDS, FEED_FOUND, [this]() { onFeedFound(); });
      52              : 
      53           52 :     machine.addStateChange(-1, FEED_ERROR, [this]() { onError(); }); // All errors.
      54              : 
      55              :     // Parser signals.
      56           44 :     connect(parserFirstTry, &ParserInterface::done, this, &FeedDiscovery::onFirstParseDone);
      57              : 
      58              :     // Web page grabber signals.
      59           44 :     connect(this->pageGrabber, &WebPageGrabber::ready, this, &FeedDiscovery::onPageGrabberReady);
      60           44 :     connect(this->feedParser, &BatchNewsParser::ready, this, &FeedDiscovery::onFeedParserReady);
      61           44 : }
      62              : 
      63           44 : FeedDiscovery::~FeedDiscovery()
      64              : {
      65              :     // Qt parent/child hierarchy handles cleanup automatically
      66           44 : }
      67              : 
      68           27 : void FeedDiscovery::checkFeed(QString sURL)
      69              : {
      70              :     // Reset state
      71           27 :     _error = false;
      72           27 :     _errorString = "";
      73           27 :     _discoveredFeeds.clear();
      74           27 :     _sortedFeedURLs.clear();
      75           27 :     machine.start(CHECK_FEED);
      76              : 
      77           27 :     QUrl url = NetworkUtilities::urlFixup(sURL);
      78              :     
      79              :     // Make sure the location isn't a "relative" (and therefore severely invalid) path.
      80           27 :     if (url.isRelative() || url.scheme().isEmpty()) {
      81              :         // Try adjusting the scheme.
      82            0 :         if (url.scheme() == "") {
      83            0 :             url.setScheme("http");
      84              :         }
      85              :         
      86              :         //qDebug() << "Location is adjusted to: " << location;
      87              :         
      88              :         // Final check!  If it's not valid, we'll set an error and bail.
      89            0 :         if (url.isRelative()) {
      90            0 :             reportError("Invalid URL");
      91              :             
      92            0 :             return;
      93              :         }
      94              :     }
      95              :     
      96              :     // Okay, we have a potential URL! Let's check it.
      97           27 :     _feedURL = url;
      98           27 :     machine.setState(TRY_FEED);
      99           27 : }
     100              : 
     101           27 : void FeedDiscovery::onTryFeed()
     102              : {
     103           27 :     parserFirstTry->parse(_feedURL);
     104           27 : }
     105              : 
     106           18 : void FeedDiscovery::onFeedFound()
     107              : {
     108           18 :     FANG_CHECK(!_error, "FeedDiscovery::onFeedFound called with _error set");
     109           18 :     FANG_CHECK(!_feedURL.isEmpty(), "FeedDiscovery::onFeedFound called with empty _feedURL");
     110              : 
     111           18 :     emit done(this);
     112           18 : }
     113              : 
     114            9 : void FeedDiscovery::onWebGrabber()
     115              : {
     116            9 :     pageGrabber->load(_feedURL);
     117            9 : }
     118              : 
     119            8 : void FeedDiscovery::onError()
     120              : {
     121            8 :     FANG_CHECK(_error, "FeedDiscovery::onError called without _error set");
     122            8 :     FANG_CHECK(!_errorString.isEmpty(), "FeedDiscovery::onError called with empty _errorString");
     123              : 
     124            8 :     emit done(this);
     125            8 : }
     126              : 
     127           26 : void FeedDiscovery::onFirstParseDone()
     128              : {
     129           26 :     int res = parserFirstTry->getResult();
     130           26 :     switch (res) {
     131           17 :     case ParserInterface::OK:
     132              :     {
     133              :         // User directly entered a feed URL! Add it to discovered feeds
     134           17 :         _feedURL = parserFirstTry->getURL();
     135           17 :         _feedResult = parserFirstTry->getFeed();
     136              : 
     137              :         // Add to discovered feeds list
     138           17 :         DiscoveredFeed discovered;
     139           17 :         discovered.url = _feedURL;
     140           17 :         discovered.feed = _feedResult;
     141           17 :         discovered.title = _feedResult ? _feedResult->title : _feedURL.toString();
     142           17 :         discovered.validated = true;
     143           17 :         _discoveredFeeds.clear();
     144           17 :         _discoveredFeeds.append(discovered);
     145              : 
     146           17 :         machine.setState(FEED_FOUND);
     147           17 :         break;
     148           17 :     }
     149              : 
     150            9 :     case ParserInterface::NETWORK_ERROR:
     151              :     case ParserInterface::FILE_ERROR:
     152              :     case ParserInterface::EMPTY_DOCUMENT:
     153              :     case ParserInterface::PARSE_ERROR:
     154              :         // Not a feed, probably HTML. Continue to the web grabber stage.
     155            9 :         machine.setState(WEB_GRABBER);
     156            9 :         break;
     157              : 
     158            0 :     case ParserInterface::IN_PROGRESS:
     159              :     default:
     160            0 :         FANG_UNREACHABLE("Unexpected parser result in onFirstParseDone");
     161              :         // Treat as error and continue to web grabber
     162              :         machine.setState(WEB_GRABBER);
     163              :         break;
     164              :     }
     165           26 : }
     166              : 
     167            9 : void FeedDiscovery::onPageGrabberReady(WebPageGrabber* grabber, QString* document)
     168              : {
     169              :     Q_UNUSED(grabber);
     170              : 
     171              :     // If we didn't get a document, bail here.
     172            9 :     if (!document || document->isEmpty()) {
     173            6 :         reportError("No page found");
     174            7 :         return;
     175              :     }
     176              : 
     177              :     // Parse feed URLs from the HTML document
     178            3 :     QList<QString> feedURLs = parseFeedsFromXHTML(*document);
     179            3 :     qDebug() << "Parsed" << feedURLs.count() << "feed URLs from HTML";
     180              : 
     181            3 :     if (feedURLs.isEmpty()) {
     182            1 :         qDebug() << "No feeds found in HTML!";
     183            1 :         reportError("No feed found");
     184            1 :         return;
     185              :     }
     186              : 
     187            2 :     qDebug() << "Total feed URLs found:" << feedURLs.count();
     188              : 
     189              :     // Sort by path length (longer paths first = more specific)
     190            2 :     QList<QString> feedURLStrings = feedURLs;
     191            2 :     std::sort(feedURLStrings.begin(), feedURLStrings.end(),
     192            3 :         [](const QString& a, const QString& b) {
     193            3 :             QUrl urlA(a);
     194            3 :             QUrl urlB(b);
     195            6 :             return urlA.path().length() > urlB.path().length();
     196            3 :         });
     197              : 
     198              :     // Convert to QUrl list and store for validation
     199            2 :     _sortedFeedURLs.clear();
     200            6 :     for (const QString& urlString : feedURLStrings) {
     201            4 :         _sortedFeedURLs.append(QUrl(urlString));
     202              :     }
     203              : 
     204              :     // Trigger bulk feed validation
     205            2 :     machine.setState(VALIDATE_FEEDS);
     206            3 : }
     207              : 
     208           29 : QList<QString> FeedDiscovery::parseFeedsFromXHTML(const QString& document)
     209              : {
     210           29 :     QList<QString> feedsFound;
     211              : 
     212              :     // Examples of what we're looking for:
     213              :     // <link rel="alternate" href="http://www.fark.com/fark.rss" type="application/rss+xml" title="FARK.com Fark RSS Feed">
     214              :     // <link rel="alternate" type="application/rss+xml" title="MrEricSir.com RSS Feed" href="http://www.mrericsir.com/blog/feed/" />
     215              :     // <link rel="alternate" type="application/atom+xml" title="MrEricSir.com Atom Feed" href="http://www.mrericsir.com/blog/feed/atom/" />
     216           29 :     const QString S_REL = "rel";
     217           29 :     const QString S_HREF = "href";
     218           29 :     const QString S_TYPE = "type";
     219           29 :     const QString S_TITLE = "title";
     220           29 :     const QString S_WORDPRESS_COMMENTS_URL_SUFFIX = "/comments/feed/";
     221              : 
     222           29 :     QXmlStreamReader xml;
     223           29 :     xml.addData(document);
     224              : 
     225         6393 :     while (!xml.atEnd()) {
     226              :         // Grab the next thingie.
     227         6363 :         xml.readNext();
     228              : 
     229         6363 :         if (xml.isStartElement()) {
     230         1610 :             QString tagName = xml.name().toString().toLower();
     231         1610 :             if (tagName == "body") {
     232              :                 // We're done with the header, so bail.
     233           28 :                 return feedsFound;
     234              :             }
     235              : 
     236         1582 :             if (tagName == "link") {
     237          479 :                 QXmlStreamAttributes attributes = xml.attributes();
     238              : 
     239              :                 // Is this a feed?
     240         1418 :                 if (attributes.hasAttribute(S_REL) && attributes.hasAttribute(S_HREF) &&
     241         1449 :                     attributes.value("", S_REL).toString().toLower() == "alternate" &&
     242         1494 :                     attributes.hasAttribute(S_TYPE) &&
     243          573 :                     (attributes.value("", S_TYPE).toString().toLower() == "application/rss+xml" ||
     244          499 :                      attributes.value("", S_TYPE).toString().toLower() == "application/atom+xml")) {
     245              :                     // Run some checks and then add our feed if it seems reasonable to do so.
     246           70 :                     QString url = attributes.value("", S_HREF).toString();
     247              : 
     248              :                     // Avoid comments feeds as they tend to get added by accident.
     249           35 :                     if (url.endsWith(S_WORDPRESS_COMMENTS_URL_SUFFIX)) {
     250            7 :                         continue;
     251              :                     }
     252              : 
     253           28 :                     feedsFound << url;
     254           35 :                 }
     255          479 :             }
     256         1610 :         }
     257              :     }
     258              : 
     259            1 :     return feedsFound;
     260           29 : }
     261              : 
     262            2 : void FeedDiscovery::onValidateFeeds()
     263              : {
     264              :     // Use the sorted feed URLs from onPageGrabberReady
     265            2 :     if (_sortedFeedURLs.isEmpty()) {
     266            0 :         reportError("No feeds to validate");
     267            0 :         return;
     268              :     }
     269              : 
     270              :     // Bulk parse all feed URLs
     271            2 :     feedParser->parse(_sortedFeedURLs);
     272              : }
     273              : 
     274            2 : void FeedDiscovery::onFeedParserReady()
     275              : {
     276              :     // Process all parsed feeds
     277            2 :     _discoveredFeeds.clear();
     278              : 
     279            2 :     QMap<QUrl, ParserInterface::ParseResult> results = feedParser->getResults();
     280            6 :     for (auto it = results.constBegin(); it != results.constEnd(); ++it) {
     281            4 :         QUrl feedURL = it.key();
     282            4 :         ParserInterface::ParseResult result = it.value();
     283              : 
     284              :         // Only include successfully parsed feeds
     285            4 :         if (result == ParserInterface::OK) {
     286            2 :             RawFeed* feed = feedParser->getFeed(feedURL);
     287            2 :             if (feed) {
     288            2 :                 DiscoveredFeed discovered;
     289            2 :                 discovered.url = feedURL;
     290            2 :                 discovered.feed = feed;  // Feed is owned by feedParser
     291            2 :                 discovered.title = feed->title.isEmpty() ? feedURL.toString() : feed->title;
     292            2 :                 discovered.content = "";  // Not storing raw content anymore
     293            2 :                 discovered.validated = true;
     294            2 :                 _discoveredFeeds.append(discovered);
     295            2 :             }
     296              :         }
     297            4 :     }
     298              : 
     299              :     // Check if we found any valid feeds
     300            2 :     if (_discoveredFeeds.isEmpty()) {
     301            1 :         reportError("No valid feeds found");
     302            1 :         return;
     303              :     }
     304              : 
     305              :     // Set the first valid feed as the primary one (for backward compatibility)
     306            1 :     _feedURL = _discoveredFeeds.first().url;
     307            1 :     _feedResult = _discoveredFeeds.first().feed;
     308              : 
     309              :     // Emit done signal
     310            1 :     machine.setState(FEED_FOUND);
     311            2 : }
     312              : 
     313            8 : void FeedDiscovery::reportError(QString errorString)
     314              : {
     315            8 :     _error = true;
     316            8 :     _errorString = errorString;
     317              : 
     318            8 :     machine.setState(FEED_ERROR);
     319            8 : }
        

Generated by: LCOV version 2.0-1