LCOV - code coverage report
Current view: top level - lib/FangFeedDiscovery - FeedDiscovery.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 94.0 % 265 249
Test Date: 2026-04-19 00:35:54 Functions: 93.5 % 31 29

            Line data    Source code
       1              : #include "FeedDiscovery.h"
       2              : #include "FeedDiscoveryLogging.h"
       3              : #include <QSimpleStateMachine/QSimpleStateMachine.h>
       4              : #include "PageMetadataExtractor.h"
       5              : #include <QXmlStreamReader>
       6              : #include <QSet>
       7              : #include <algorithm>
       8              : #include "WebUtilities.h"
       9              : #include "FeedSource.h"
      10              : #include "FeedFetchResult.h"
      11              : #include "FeedFetcher.h"
      12              : #include "BatchFeedFetcher.h"
      13              : #include "WebPageGrabber.h"
      14              : #include "NewsSitemapSynthesizer.h"
      15              : 
      16           71 : FeedDiscovery::FeedDiscovery(QObject *parent,
      17              :                            FeedSource* firstParser,
      18              :                            WebPageGrabber* pageGrabber,
      19              :                            BatchFeedFetcher* feedParser,
      20           71 :                            NewsSitemapSynthesizer* sitemapSynthesizer) :
      21              :     QObject(parent),
      22           71 :     machine(new QSimpleStateMachine(this)),
      23           71 :     _error(Error::None),
      24           71 :     _errorString(),
      25           71 :     _probingCommonPaths(false),
      26          213 :     newsSitemapSynthesizer(sitemapSynthesizer)
      27              : {
      28              :     // Create default implementations if not provided (with this as parent for auto-cleanup)
      29           71 :     parserFirstTry = firstParser ? firstParser : new FeedFetcher(this);
      30           71 :     this->pageGrabber = pageGrabber ? pageGrabber : new WebPageGrabber(this);
      31           71 :     this->feedParser = feedParser ? feedParser : new BatchFeedFetcher(this);
      32              : 
      33              :     // Take ownership of injected dependencies by setting parent
      34           71 :     if (parserFirstTry && !parserFirstTry->parent()) {
      35           30 :         parserFirstTry->setParent(this);
      36              :     }
      37           71 :     if (this->pageGrabber && !this->pageGrabber->parent()) {
      38           30 :         this->pageGrabber->setParent(this);
      39              :     }
      40           71 :     if (this->feedParser && !this->feedParser->parent()) {
      41           30 :         this->feedParser->setParent(this);
      42              :     }
      43           71 :     if (newsSitemapSynthesizer && !newsSitemapSynthesizer->parent()) {
      44           12 :         newsSitemapSynthesizer->setParent(this);
      45              :     }
      46              : 
      47              :     // Set up our state machine.
      48          110 :     machine->addStateChange(CHECK_FEED, TRY_FEED, [this]() { onTryFeed(); });
      49           88 :     machine->addStateChange(TRY_FEED, FEED_FOUND, [this]() { onFeedFound(); });
      50           92 :     machine->addStateChange(TRY_FEED, WEB_GRABBER, [this]() { onWebGrabber(); });
      51           78 :     machine->addStateChange(WEB_GRABBER, VALIDATE_FEEDS, [this]() { onValidateFeeds(); });
      52           75 :     machine->addStateChange(VALIDATE_FEEDS, FEED_FOUND, [this]() { onFeedFound(); });
      53           85 :     machine->addStateChange(WEB_GRABBER, TRY_COMMON_PATHS, [this]() { onTryCommonPaths(); });
      54           74 :     machine->addStateChange(VALIDATE_FEEDS, TRY_COMMON_PATHS, [this]() { onTryCommonPaths(); });
      55           74 :     machine->addStateChange(TRY_COMMON_PATHS, FEED_FOUND, [this]() { onFeedFound(); });
      56           85 :     machine->addStateChange(TRY_COMMON_PATHS, TRY_NEWS_SITEMAP, [this]() { onTryNewsSitemap(); });
      57           73 :     machine->addStateChange(TRY_NEWS_SITEMAP, FEED_FOUND, [this]() { onFeedFound(); });
      58              : 
      59           83 :     machine->addStateChange(-1, FEED_ERROR, [this]() { onError(); }); // All errors.
      60              : 
      61              :     // Overall discovery timeout.
      62           71 :     timeoutTimer.setSingleShot(true);
      63           71 :     timeoutTimer.setInterval(30000);
      64           71 :     connect(&timeoutTimer, &QTimer::timeout, this, &FeedDiscovery::onTimeout);
      65              : 
      66              :     // Parser signals.
      67           71 :     connect(parserFirstTry, &FeedSource::done, this, &FeedDiscovery::onFirstParseDone);
      68              : 
      69              :     // Web page grabber signals.
      70           71 :     connect(this->pageGrabber, &WebPageGrabber::ready, this, &FeedDiscovery::onPageGrabberReady);
      71           71 :     connect(this->feedParser, &BatchFeedFetcher::ready, this, &FeedDiscovery::onFeedParserReady);
      72           71 : }
      73              : 
      74           71 : FeedDiscovery::~FeedDiscovery()
      75              : {
      76              :     // Qt parent/child hierarchy handles cleanup automatically
      77           71 : }
      78              : 
      79           39 : void FeedDiscovery::checkFeed(QString sURL)
      80              : {
      81              :     // Reset state
      82           39 :     _error = Error::None;
      83           39 :     _errorString.clear();
      84           39 :     _probingCommonPaths = false;
      85           39 :     _discoveredFeeds.clear();
      86           39 :     _sortedFeedURLs.clear();
      87           39 :     machine->start(CHECK_FEED);
      88              : 
      89           39 :     QUrl url = WebUtilities::urlFixup(sURL);
      90              :     
      91              :     // Make sure the location isn't a "relative" (and therefore severely invalid) path.
      92           39 :     if (url.isRelative() || url.scheme().isEmpty()) {
      93              :         // Try adjusting the scheme.
      94            0 :         if (url.scheme() == "") {
      95            0 :             url.setScheme("http");
      96              :         }
      97              :         
      98              :         //qCDebug(logFeedDiscovery) << "Location is adjusted to: " << location;
      99              :         
     100              :         // Final check!  If it's not valid, we'll set an error and bail.
     101            0 :         if (url.isRelative()) {
     102            0 :             reportError(Error::InvalidURL, "Invalid URL");
     103              :             
     104            0 :             return;
     105              :         }
     106              :     }
     107              :     
     108              :     // Okay, we have a potential URL! Let's check it.
     109           39 :     _feedURL = url;
     110           39 :     machine->setState(TRY_FEED);
     111           39 :     timeoutTimer.start();
     112           39 : }
     113              : 
     114           39 : void FeedDiscovery::onTryFeed()
     115              : {
     116           39 :     parserFirstTry->parse(_feedURL);
     117           39 : }
     118              : 
     119           26 : void FeedDiscovery::onFeedFound()
     120              : {
     121           26 :     timeoutTimer.stop();
     122           26 :     Q_ASSERT(_error == Error::None);
     123           26 :     Q_ASSERT(!_feedURL.isEmpty());
     124              : 
     125           26 :     emit done(this);
     126           26 : }
     127              : 
     128           21 : void FeedDiscovery::onWebGrabber()
     129              : {
     130           21 :     pageGrabber->load(_feedURL);
     131           21 : }
     132              : 
     133           12 : void FeedDiscovery::onError()
     134              : {
     135           12 :     timeoutTimer.stop();
     136           12 :     Q_ASSERT(_error != Error::None);
     137           12 :     Q_ASSERT(!_errorString.isEmpty());
     138              : 
     139           12 :     emit done(this);
     140           12 : }
     141              : 
     142            0 : void FeedDiscovery::onTimeout()
     143              : {
     144            0 :     reportError(Error::Timeout, "Feed discovery timed out");
     145            0 : }
     146              : 
     147           38 : void FeedDiscovery::onFirstParseDone()
     148              : {
     149           38 :     FeedFetchResult res = parserFirstTry->getResult();
     150           38 :     switch (res) {
     151           18 :     case FeedFetchResult::OK:
     152              :     {
     153              :         // User directly entered a feed URL! Add it to discovered feeds
     154           18 :         _feedURL = parserFirstTry->getURL();
     155           18 :         auto parsedFeed = parserFirstTry->getFeed();
     156              : 
     157              :         // Reject empty feeds - a feed that parses OK but has no items is useless.
     158           18 :         if (!parsedFeed || parsedFeed->items.isEmpty()) {
     159            2 :             qCDebug(logFeedDiscovery) << "Feed parsed OK but has no items, trying web grabber";
     160            1 :             machine->setState(WEB_GRABBER);
     161            1 :             break;
     162              :         }
     163              : 
     164              :         // Add to discovered feeds list
     165           17 :         DiscoveredFeed discovered;
     166           17 :         discovered.url = _feedURL;
     167           17 :         discovered.feed = parsedFeed;
     168           17 :         discovered.title = parsedFeed->title.isEmpty() ? _feedURL.toString() : parsedFeed->title;
     169           17 :         discovered.validated = true;
     170           17 :         _discoveredFeeds.clear();
     171           17 :         _discoveredFeeds.append(discovered);
     172              : 
     173           17 :         machine->setState(FEED_FOUND);
     174           17 :         break;
     175           18 :     }
     176              : 
     177           20 :     case FeedFetchResult::NetworkError:
     178              :     case FeedFetchResult::FileError:
     179              :     case FeedFetchResult::EmptyDocument:
     180              :     case FeedFetchResult::ParseError:
     181              :         // Not a feed, probably HTML. Continue to the web grabber stage.
     182           20 :         machine->setState(WEB_GRABBER);
     183           20 :         break;
     184              : 
     185            0 :     case FeedFetchResult::InProgress:
     186              :     default:
     187            0 :         qCCritical(logFeedDiscovery) << "Unexpected parser result in onFirstParseDone";
     188            0 :         Q_UNREACHABLE();
     189              :         // Treat as error and continue to web grabber
     190              :         machine->setState(WEB_GRABBER);
     191              :         break;
     192              :     }
     193           38 : }
     194              : 
     195           21 : void FeedDiscovery::onPageGrabberReady(WebPageGrabber* grabber, QString* document)
     196              : {
     197              :     Q_UNUSED(grabber);
     198              : 
     199              :     // If we didn't get a document, try common paths before giving up.
     200           21 :     if (!document || document->isEmpty()) {
     201           14 :         qCDebug(logFeedDiscovery) << "No page found, trying common paths";
     202            7 :         machine->setState(TRY_COMMON_PATHS);
     203           14 :         return;
     204              :     }
     205              : 
     206              :     // Parse feed URLs from the HTML document
     207           14 :     QList<QString> feedURLs = parseFeedsFromXHTML(*document);
     208           28 :     qCDebug(logFeedDiscovery) << "Parsed" << feedURLs.count() << "feed URLs from HTML";
     209              : 
     210           14 :     if (feedURLs.isEmpty()) {
     211           14 :         qCDebug(logFeedDiscovery) << "No feeds found in HTML, trying common paths";
     212            7 :         _pageXHTML = *document;
     213            7 :         machine->setState(TRY_COMMON_PATHS);
     214            7 :         return;
     215              :     }
     216              : 
     217           14 :     qCDebug(logFeedDiscovery) << "Total feed URLs found:" << feedURLs.count();
     218              : 
     219              :     // Sort by path length (longer paths first = more specific)
     220            7 :     QList<QString> feedURLStrings = feedURLs;
     221            7 :     std::sort(feedURLStrings.begin(), feedURLStrings.end(),
     222            3 :         [](const QString& a, const QString& b) {
     223            3 :             QUrl urlA(a);
     224            3 :             QUrl urlB(b);
     225            6 :             return urlA.path().length() > urlB.path().length();
     226            3 :         });
     227              : 
     228              :     // Convert to QUrl list and store for validation
     229            7 :     _sortedFeedURLs.clear();
     230           16 :     for (const QString& urlString : feedURLStrings) {
     231            9 :         QUrl feedUrl(urlString);
     232              : 
     233              :         // Fix relative URLs.
     234            9 :         if (feedUrl.isRelative()) {
     235            2 :             feedUrl = _feedURL.resolved(feedUrl);
     236              :         }
     237            9 :         _sortedFeedURLs.append(feedUrl);
     238            9 :     }
     239              : 
     240              :     // Trigger bulk feed validation
     241            7 :     machine->setState(VALIDATE_FEEDS);
     242           14 : }
     243              : 
     244           55 : QList<QString> FeedDiscovery::parseFeedsFromXHTML(const QString& document)
     245              : {
     246           55 :     QList<QString> feedsFound;
     247              : 
     248              :     // Examples of what we're looking for:
     249              :     // <link rel="alternate" href="http://www.fark.com/fark.rss" type="application/rss+xml" title="FARK.com Fark RSS Feed">
     250              :     // <link rel="alternate" type="application/rss+xml" title="MrEricSir.com RSS Feed" href="http://www.mrericsir.com/blog/feed/" />
     251              :     // <link rel="alternate" type="application/atom+xml" title="MrEricSir.com Atom Feed" href="http://www.mrericsir.com/blog/feed/atom/" />
     252           55 :     const QString S_REL = "rel";
     253           55 :     const QString S_HREF = "href";
     254           55 :     const QString S_TYPE = "type";
     255           55 :     const QString S_TITLE = "title";
     256           55 :     const QString S_WORDPRESS_COMMENTS_URL_SUFFIX = "/comments/feed/";
     257              : 
     258           55 :     QXmlStreamReader xml;
     259           55 :     xml.addData(document);
     260              : 
     261        10790 :     while (!xml.atEnd()) {
     262              :         // Grab the next thingie.
     263        10734 :         xml.readNext();
     264              : 
     265        10734 :         if (xml.isStartElement()) {
     266         2790 :             QString tagName = xml.name().toString().toLower();
     267         2790 :             if (tagName == "body") {
     268              :                 // We're done with the header, so bail.
     269           54 :                 return feedsFound;
     270              :             }
     271              : 
     272         2736 :             if (tagName == "link") {
     273          887 :                 QXmlStreamAttributes attributes = xml.attributes();
     274              : 
     275              :                 // Is this a feed?
     276         2629 :                 if (attributes.hasAttribute(S_REL) && attributes.hasAttribute(S_HREF) &&
     277         2751 :                     attributes.value("", S_REL).toString().toLower() == "alternate" &&
     278         2821 :                     attributes.hasAttribute(S_TYPE) &&
     279         1024 :                     (attributes.value("", S_TYPE).toString().toLower() == "application/rss+xml" ||
     280          928 :                      attributes.value("", S_TYPE).toString().toLower() == "application/atom+xml" ||
     281          909 :                      attributes.value("", S_TYPE).toString().toLower() == "application/feed+json")) {
     282              :                     // Run some checks and then add our feed if it seems reasonable to do so.
     283          102 :                     QString url = attributes.value("", S_HREF).toString();
     284              : 
     285              :                     // Avoid comments feeds as they tend to get added by accident.
     286           51 :                     if (url.endsWith(S_WORDPRESS_COMMENTS_URL_SUFFIX)) {
     287            8 :                         continue;
     288              :                     }
     289              : 
     290              :                     // Strip trailing slash from feed paths. Some servers (e.g. cbsnews.com)
     291              :                     // return 404 for trailing-slash feed URLs but 200 without.
     292           43 :                     if (url.endsWith("/") && !url.endsWith("://")) {
     293           18 :                         url.chop(1);
     294              :                     }
     295              : 
     296           43 :                     feedsFound << url;
     297           51 :                 }
     298          887 :             }
     299         2790 :         }
     300              :     }
     301              : 
     302            1 :     return feedsFound;
     303           55 : }
     304              : 
     305            7 : void FeedDiscovery::onValidateFeeds()
     306              : {
     307              :     // Use the sorted feed URLs from onPageGrabberReady
     308            7 :     if (_sortedFeedURLs.isEmpty()) {
     309            0 :         reportError(Error::NoFeedsFound, "No feeds to validate");
     310            0 :         return;
     311              :     }
     312              : 
     313              :     // Bulk parse all feed URLs
     314            7 :     feedParser->parse(_sortedFeedURLs);
     315              : }
     316              : 
     317           24 : void FeedDiscovery::onFeedParserReady()
     318              : {
     319              :     // Process all parsed feeds
     320           24 :     _discoveredFeeds.clear();
     321              : 
     322           24 :     QMap<QUrl, FeedFetchResult> results = feedParser->getResults();
     323          186 :     for (auto it = results.constBegin(); it != results.constEnd(); ++it) {
     324          162 :         QUrl feedURL = it.key();
     325          162 :         FeedFetchResult result = it.value();
     326              : 
     327              :         // Only include successfully parsed feeds that have items.
     328          162 :         if (result == FeedFetchResult::OK) {
     329           10 :             auto feed = feedParser->getFeed(feedURL);
     330           10 :             if (feed && !feed->items.isEmpty()) {
     331            8 :                 DiscoveredFeed discovered;
     332            8 :                 discovered.url = feedURL;
     333            8 :                 discovered.feed = feed;
     334            8 :                 discovered.title = feed->title.isEmpty() ? feedURL.toString() : feed->title;
     335            8 :                 discovered.validated = true;
     336            8 :                 _discoveredFeeds.append(discovered);
     337            8 :             }
     338           10 :         }
     339          162 :     }
     340              : 
     341              :     // Check if we found any valid feeds.
     342           24 :     if (_discoveredFeeds.isEmpty()) {
     343           17 :         if (_probingCommonPaths) {
     344              :             // Common paths didn't turn up anything.
     345           28 :             qCDebug(logFeedDiscovery) << "No valid feeds at common paths, trying sitemap";
     346           14 :             _probingCommonPaths = false;
     347           14 :             machine->setState(TRY_NEWS_SITEMAP);
     348              :         } else {
     349              :             // Validation of HTML-discovered feeds failed.
     350            6 :             qCDebug(logFeedDiscovery) << "No valid feeds found, trying common paths";
     351            3 :             machine->setState(TRY_COMMON_PATHS);
     352              :         }
     353           17 :         return;
     354              :     }
     355              : 
     356            7 :     _probingCommonPaths = false;
     357              : 
     358              :     // Set the first valid feed as the primary one (for backward compatibility)
     359            7 :     _feedURL = _discoveredFeeds.first().url;
     360              : 
     361              :     // Emit done signal
     362            7 :     machine->setState(FEED_FOUND);
     363           24 : }
     364              : 
     365           17 : QStringList FeedDiscovery::commonFeedPaths()
     366              : {
     367              :     return {
     368              :         "/feed",
     369              :         "/rss",
     370              :         "/feed.xml",
     371              :         "/feed.json",
     372              :         "/rss.xml",
     373              :         "/rss2.0.xml",
     374              :         "/atom.xml",
     375              :         "/index.xml",
     376              :         "/blog/feed"
     377          170 :     };
     378           17 : }
     379              : 
     380           17 : void FeedDiscovery::onTryCommonPaths()
     381              : {
     382           17 :     QUrl rootUrl;
     383           17 :     rootUrl.setScheme(_feedURL.scheme());
     384           17 :     rootUrl.setHost(_feedURL.host());
     385           17 :     if (_feedURL.port() != -1) {
     386            0 :         rootUrl.setPort(_feedURL.port());
     387              :     }
     388              : 
     389           17 :     QList<QUrl> probeURLs;
     390          170 :     for (const QString& path : commonFeedPaths()) {
     391          153 :         QUrl probeUrl = rootUrl;
     392          153 :         probeUrl.setPath(path);
     393          153 :         probeURLs.append(probeUrl);
     394          170 :     }
     395              : 
     396           34 :     qCDebug(logFeedDiscovery) << "Probing" << probeURLs.count() << "common feed paths";
     397           17 :     _probingCommonPaths = true;
     398           17 :     feedParser->parse(probeURLs);
     399           17 : }
     400              : 
     401           14 : void FeedDiscovery::onTryNewsSitemap()
     402              : {
     403              :     // Extract site title from the already-fetched XHTML (if available).
     404              :     // The page content may be a redirect or error page, so validate the title.
     405           14 :     QString siteTitle;
     406           14 :     if (!_pageXHTML.isEmpty()) {
     407            5 :         PageMetadata meta = PageMetadataExtractor::extract(_pageXHTML);
     408              :         // Reject titles that look like HTTP status messages.
     409           15 :         if (!meta.title.isEmpty()
     410           10 :             && !meta.title.contains("Moved", Qt::CaseInsensitive)
     411           10 :             && !meta.title.contains("Forbidden", Qt::CaseInsensitive)
     412           10 :             && !meta.title.contains("Not Found", Qt::CaseInsensitive)
     413           10 :             && !meta.title.contains("Error", Qt::CaseInsensitive)) {
     414            5 :             siteTitle = meta.title;
     415              :         }
     416            5 :     }
     417           14 :     if (siteTitle.isEmpty()) {
     418            9 :         siteTitle = _feedURL.host();
     419              :     }
     420              : 
     421           28 :     qCDebug(logFeedDiscovery) << "FeedDiscovery: trying sitemap for" << _feedURL
     422           14 :                         << "with title" << siteTitle;
     423              : 
     424           14 :     if (!newsSitemapSynthesizer) {
     425            2 :         newsSitemapSynthesizer = new NewsSitemapSynthesizer(this);
     426              :     }
     427           14 :     connect(newsSitemapSynthesizer, &NewsSitemapSynthesizer::done,
     428           14 :             this, &FeedDiscovery::onNewsSitemapDone, Qt::UniqueConnection);
     429           14 :     newsSitemapSynthesizer->synthesize(_feedURL, siteTitle);
     430           14 : }
     431              : 
     432           14 : void FeedDiscovery::onNewsSitemapDone()
     433              : {
     434           14 :     if (newsSitemapSynthesizer->hasError()) {
     435           12 :         reportError(Error::NoFeedsFound, newsSitemapSynthesizer->errorString());
     436           12 :         return;
     437              :     }
     438              : 
     439            2 :     auto synthFeed = newsSitemapSynthesizer->result();
     440            2 :     if (!synthFeed || synthFeed->items.isEmpty()) {
     441            0 :         reportError(Error::NoFeedsFound, "No feed found");
     442            0 :         return;
     443              :     }
     444              : 
     445              :     // Set the primary feed result.
     446            2 :     _feedURL = synthFeed->url;
     447              : 
     448              :     // Add to discovered feeds list.
     449            2 :     DiscoveredFeed discovered;
     450            2 :     discovered.url = synthFeed->url;
     451            2 :     discovered.feed = synthFeed;
     452            2 :     discovered.title = synthFeed->title;
     453            2 :     discovered.validated = true;
     454            2 :     _discoveredFeeds.clear();
     455            2 :     _discoveredFeeds.append(discovered);
     456              : 
     457            2 :     machine->setState(FEED_FOUND);
     458            2 : }
     459              : 
     460           12 : void FeedDiscovery::reportError(Error error, const QString& errorString)
     461              : {
     462           12 :     _error = error;
     463           12 :     _errorString = errorString;
     464              : 
     465           12 :     machine->setState(FEED_ERROR);
     466           12 : }
        

Generated by: LCOV version 2.0-1