LCOV - code coverage report
Current view: top level - src/utilities - FeedDiscovery.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 94.4 % 269 254
Test Date: 2026-03-23 10:19:47 Functions: 93.5 % 31 29

            Line data    Source code
       1              : #include "FeedDiscovery.h"
       2              : #include "FangLogging.h"
       3              : #include "PageMetadataExtractor.h"
       4              : #include <QXmlStreamReader>
       5              : #include <QSet>
       6              : #include <algorithm>
       7              : #include "NetworkUtilities.h"
       8              : #include "ErrorHandling.h"
       9              : #include "../parser/NewsParser.h"
      10              : #include "../parser/BatchNewsParser.h"
      11              : #include "WebPageGrabber.h"
      12              : 
      13           70 : FeedDiscovery::FeedDiscovery(QObject *parent,
      14              :                            ParserInterface* firstParser,
      15              :                            ParserInterface* secondParser,
      16              :                            WebPageGrabber* pageGrabber,
      17              :                            BatchNewsParser* feedParser,
      18           70 :                            GoogleNewsSitemapSynthesizer* sitemapSynthesizer) :
      19              :     FangObject(parent),
      20           70 :     machine(),
      21           70 :     _error(false),
      22           70 :     _errorString(""),
      23           70 :     _feedResult(nullptr),
      24           70 :     _probingCommonPaths(false),
      25          140 :     newsSitemapSynthesizer(sitemapSynthesizer)
      26              : {
      27              :     // Handle secondParser: no longer used, but we need to clean it up if provided
      28           70 :     if (secondParser) {
      29           30 :         if (!secondParser->parent()) {
      30           30 :             secondParser->setParent(this);  // Take ownership so it gets cleaned up
      31              :         }
      32              :     }
      33              : 
      34              :     // Create default implementations if not provided (with this as parent for auto-cleanup)
      35           70 :     parserFirstTry = firstParser ? firstParser : new NewsParser(this);
      36           70 :     this->pageGrabber = pageGrabber ? pageGrabber : new WebPageGrabber(this);
      37           70 :     this->feedParser = feedParser ? feedParser : new BatchNewsParser(this);
      38              : 
      39              :     // Take ownership of injected dependencies by setting parent
      40           70 :     if (parserFirstTry && !parserFirstTry->parent()) {
      41           30 :         parserFirstTry->setParent(this);
      42              :     }
      43           70 :     if (this->pageGrabber && !this->pageGrabber->parent()) {
      44           30 :         this->pageGrabber->setParent(this);
      45              :     }
      46           70 :     if (this->feedParser && !this->feedParser->parent()) {
      47           30 :         this->feedParser->setParent(this);
      48              :     }
      49           70 :     if (newsSitemapSynthesizer && !newsSitemapSynthesizer->parent()) {
      50           12 :         newsSitemapSynthesizer->setParent(this);
      51              :     }
      52              : 
      53              :     // Set up our state machine.
      54          109 :     machine.addStateChange(CHECK_FEED, TRY_FEED, [this]() { onTryFeed(); });
      55           87 :     machine.addStateChange(TRY_FEED, FEED_FOUND, [this]() { onFeedFound(); });
      56           91 :     machine.addStateChange(TRY_FEED, WEB_GRABBER, [this]() { onWebGrabber(); });
      57           77 :     machine.addStateChange(WEB_GRABBER, VALIDATE_FEEDS, [this]() { onValidateFeeds(); });
      58           74 :     machine.addStateChange(VALIDATE_FEEDS, FEED_FOUND, [this]() { onFeedFound(); });
      59           84 :     machine.addStateChange(WEB_GRABBER, TRY_COMMON_PATHS, [this]() { onTryCommonPaths(); });
      60           73 :     machine.addStateChange(VALIDATE_FEEDS, TRY_COMMON_PATHS, [this]() { onTryCommonPaths(); });
      61           73 :     machine.addStateChange(TRY_COMMON_PATHS, FEED_FOUND, [this]() { onFeedFound(); });
      62           84 :     machine.addStateChange(TRY_COMMON_PATHS, TRY_GOOGLE_NEWS_SITEMAP, [this]() { onTryGoogleNewsSitemap(); });
      63           72 :     machine.addStateChange(TRY_GOOGLE_NEWS_SITEMAP, FEED_FOUND, [this]() { onFeedFound(); });
      64              : 
      65           82 :     machine.addStateChange(-1, FEED_ERROR, [this]() { onError(); }); // All errors.
      66              : 
      67              :     // Overall discovery timeout.
      68           70 :     timeoutTimer.setSingleShot(true);
      69           70 :     timeoutTimer.setInterval(30000);
      70           70 :     connect(&timeoutTimer, &QTimer::timeout, this, &FeedDiscovery::onTimeout);
      71              : 
      72              :     // Parser signals.
      73           70 :     connect(parserFirstTry, &ParserInterface::done, this, &FeedDiscovery::onFirstParseDone);
      74              : 
      75              :     // Web page grabber signals.
      76           70 :     connect(this->pageGrabber, &WebPageGrabber::ready, this, &FeedDiscovery::onPageGrabberReady);
      77           70 :     connect(this->feedParser, &BatchNewsParser::ready, this, &FeedDiscovery::onFeedParserReady);
      78           70 : }
      79              : 
      80           70 : FeedDiscovery::~FeedDiscovery()
      81              : {
      82              :     // Qt parent/child hierarchy handles cleanup automatically
      83           70 : }
      84              : 
      85           39 : void FeedDiscovery::checkFeed(QString sURL)
      86              : {
      87              :     // Reset state
      88           39 :     _error = false;
      89           39 :     _errorString = "";
      90           39 :     _probingCommonPaths = false;
      91           39 :     _discoveredFeeds.clear();
      92           39 :     _sortedFeedURLs.clear();
      93           39 :     machine.start(CHECK_FEED);
      94              : 
      95           39 :     QUrl url = NetworkUtilities::urlFixup(sURL);
      96              :     
      97              :     // Make sure the location isn't a "relative" (and therefore severely invalid) path.
      98           39 :     if (url.isRelative() || url.scheme().isEmpty()) {
      99              :         // Try adjusting the scheme.
     100            0 :         if (url.scheme() == "") {
     101            0 :             url.setScheme("http");
     102              :         }
     103              :         
     104              :         //qCDebug(logUtility) << "Location is adjusted to: " << location;
     105              :         
     106              :         // Final check!  If it's not valid, we'll set an error and bail.
     107            0 :         if (url.isRelative()) {
     108            0 :             reportError("Invalid URL");
     109              :             
     110            0 :             return;
     111              :         }
     112              :     }
     113              :     
     114              :     // Okay, we have a potential URL! Let's check it.
     115           39 :     _feedURL = url;
     116           39 :     machine.setState(TRY_FEED);
     117           39 :     timeoutTimer.start();
     118           39 : }
     119              : 
     120           39 : void FeedDiscovery::onTryFeed()
     121              : {
     122           39 :     parserFirstTry->parse(_feedURL);
     123           39 : }
     124              : 
     125           26 : void FeedDiscovery::onFeedFound()
     126              : {
     127           26 :     timeoutTimer.stop();
     128           26 :     FANG_CHECK(!_error, "FeedDiscovery::onFeedFound called with _error set");
     129           26 :     FANG_CHECK(!_feedURL.isEmpty(), "FeedDiscovery::onFeedFound called with empty _feedURL");
     130              : 
     131           26 :     emit done(this);
     132           26 : }
     133              : 
     134           21 : void FeedDiscovery::onWebGrabber()
     135              : {
     136           21 :     pageGrabber->load(_feedURL);
     137           21 : }
     138              : 
     139           12 : void FeedDiscovery::onError()
     140              : {
     141           12 :     timeoutTimer.stop();
     142           12 :     FANG_CHECK(_error, "FeedDiscovery::onError called without _error set");
     143           12 :     FANG_CHECK(!_errorString.isEmpty(), "FeedDiscovery::onError called with empty _errorString");
     144              : 
     145           12 :     emit done(this);
     146           12 : }
     147              : 
     148            0 : void FeedDiscovery::onTimeout()
     149              : {
     150            0 :     reportError("Feed discovery timed out");
     151            0 : }
     152              : 
     153           38 : void FeedDiscovery::onFirstParseDone()
     154              : {
     155           38 :     int res = parserFirstTry->getResult();
     156           38 :     switch (res) {
     157           18 :     case ParserInterface::OK:
     158              :     {
     159              :         // User directly entered a feed URL! Add it to discovered feeds
     160           18 :         _feedURL = parserFirstTry->getURL();
     161           18 :         _feedResult = parserFirstTry->getFeed();
     162              : 
     163              :         // Reject empty feeds - a feed that parses OK but has no items is useless.
     164           18 :         if (!_feedResult || _feedResult->items.isEmpty()) {
     165            2 :             qCDebug(logUtility) << "Feed parsed OK but has no items, trying web grabber";
     166            1 :             machine.setState(WEB_GRABBER);
     167            1 :             break;
     168              :         }
     169              : 
     170              :         // Add to discovered feeds list
     171           17 :         DiscoveredFeed discovered;
     172           17 :         discovered.url = _feedURL;
     173           17 :         discovered.feed = _feedResult;
     174           17 :         discovered.title = _feedResult->title.isEmpty() ? _feedURL.toString() : _feedResult->title;
     175           17 :         discovered.validated = true;
     176           17 :         _discoveredFeeds.clear();
     177           17 :         _discoveredFeeds.append(discovered);
     178              : 
     179           17 :         machine.setState(FEED_FOUND);
     180           17 :         break;
     181           17 :     }
     182              : 
     183           20 :     case ParserInterface::NETWORK_ERROR:
     184              :     case ParserInterface::FILE_ERROR:
     185              :     case ParserInterface::EMPTY_DOCUMENT:
     186              :     case ParserInterface::PARSE_ERROR:
     187              :         // Not a feed, probably HTML. Continue to the web grabber stage.
     188           20 :         machine.setState(WEB_GRABBER);
     189           20 :         break;
     190              : 
     191            0 :     case ParserInterface::IN_PROGRESS:
     192              :     default:
     193            0 :         FANG_UNREACHABLE("Unexpected parser result in onFirstParseDone");
     194              :         // Treat as error and continue to web grabber
     195              :         machine.setState(WEB_GRABBER);
     196              :         break;
     197              :     }
     198           38 : }
     199              : 
     200           21 : void FeedDiscovery::onPageGrabberReady(WebPageGrabber* grabber, QString* document)
     201              : {
     202              :     Q_UNUSED(grabber);
     203              : 
     204              :     // If we didn't get a document, try common paths before giving up.
     205           21 :     if (!document || document->isEmpty()) {
     206           14 :         qCDebug(logUtility) << "No page found, trying common paths";
     207            7 :         machine.setState(TRY_COMMON_PATHS);
     208           14 :         return;
     209              :     }
     210              : 
     211              :     // Parse feed URLs from the HTML document
     212           14 :     QList<QString> feedURLs = parseFeedsFromXHTML(*document);
     213           28 :     qCDebug(logUtility) << "Parsed" << feedURLs.count() << "feed URLs from HTML";
     214              : 
     215           14 :     if (feedURLs.isEmpty()) {
     216           14 :         qCDebug(logUtility) << "No feeds found in HTML, trying common paths";
     217            7 :         _pageXHTML = *document;
     218            7 :         machine.setState(TRY_COMMON_PATHS);
     219            7 :         return;
     220              :     }
     221              : 
     222           14 :     qCDebug(logUtility) << "Total feed URLs found:" << feedURLs.count();
     223              : 
     224              :     // Sort by path length (longer paths first = more specific)
     225            7 :     QList<QString> feedURLStrings = feedURLs;
     226            7 :     std::sort(feedURLStrings.begin(), feedURLStrings.end(),
     227            3 :         [](const QString& a, const QString& b) {
     228            3 :             QUrl urlA(a);
     229            3 :             QUrl urlB(b);
     230            6 :             return urlA.path().length() > urlB.path().length();
     231            3 :         });
     232              : 
     233              :     // Convert to QUrl list and store for validation
     234            7 :     _sortedFeedURLs.clear();
     235           16 :     for (const QString& urlString : feedURLStrings) {
     236            9 :         QUrl feedUrl(urlString);
     237              : 
     238              :         // Fix relative URLs.
     239            9 :         if (feedUrl.isRelative()) {
     240            2 :             feedUrl = _feedURL.resolved(feedUrl);
     241              :         }
     242            9 :         _sortedFeedURLs.append(feedUrl);
     243            9 :     }
     244              : 
     245              :     // Trigger bulk feed validation
     246            7 :     machine.setState(VALIDATE_FEEDS);
     247           14 : }
     248              : 
     249           54 : QList<QString> FeedDiscovery::parseFeedsFromXHTML(const QString& document)
     250              : {
     251           54 :     QList<QString> feedsFound;
     252              : 
     253              :     // Examples of what we're looking for:
     254              :     // <link rel="alternate" href="http://www.fark.com/fark.rss" type="application/rss+xml" title="FARK.com Fark RSS Feed">
     255              :     // <link rel="alternate" type="application/rss+xml" title="MrEricSir.com RSS Feed" href="http://www.mrericsir.com/blog/feed/" />
     256              :     // <link rel="alternate" type="application/atom+xml" title="MrEricSir.com Atom Feed" href="http://www.mrericsir.com/blog/feed/atom/" />
     257           54 :     const QString S_REL = "rel";
     258           54 :     const QString S_HREF = "href";
     259           54 :     const QString S_TYPE = "type";
     260           54 :     const QString S_TITLE = "title";
     261           54 :     const QString S_WORDPRESS_COMMENTS_URL_SUFFIX = "/comments/feed/";
     262              : 
     263           54 :     QXmlStreamReader xml;
     264           54 :     xml.addData(document);
     265              : 
     266        10767 :     while (!xml.atEnd()) {
     267              :         // Grab the next thingie.
     268        10712 :         xml.readNext();
     269              : 
     270        10712 :         if (xml.isStartElement()) {
     271         2783 :             QString tagName = xml.name().toString().toLower();
     272         2783 :             if (tagName == "body") {
     273              :                 // We're done with the header, so bail.
     274           53 :                 return feedsFound;
     275              :             }
     276              : 
     277         2730 :             if (tagName == "link") {
     278          885 :                 QXmlStreamAttributes attributes = xml.attributes();
     279              : 
     280              :                 // Is this a feed?
     281         2623 :                 if (attributes.hasAttribute(S_REL) && attributes.hasAttribute(S_HREF) &&
     282         2743 :                     attributes.value("", S_REL).toString().toLower() == "alternate" &&
     283         2812 :                     attributes.hasAttribute(S_TYPE) &&
     284         1017 :                     (attributes.value("", S_TYPE).toString().toLower() == "application/rss+xml" ||
     285          913 :                      attributes.value("", S_TYPE).toString().toLower() == "application/atom+xml")) {
     286              :                     // Run some checks and then add our feed if it seems reasonable to do so.
     287           98 :                     QString url = attributes.value("", S_HREF).toString();
     288              : 
     289              :                     // Avoid comments feeds as they tend to get added by accident.
     290           49 :                     if (url.endsWith(S_WORDPRESS_COMMENTS_URL_SUFFIX)) {
     291            8 :                         continue;
     292              :                     }
     293              : 
     294              :                     // Strip trailing slash from feed paths. Some servers (e.g. cbsnews.com)
     295              :                     // return 404 for trailing-slash feed URLs but 200 without.
     296           41 :                     if (url.endsWith("/") && !url.endsWith("://")) {
     297           18 :                         url.chop(1);
     298              :                     }
     299              : 
     300           41 :                     feedsFound << url;
     301           49 :                 }
     302          885 :             }
     303         2783 :         }
     304              :     }
     305              : 
     306            1 :     return feedsFound;
     307           54 : }
     308              : 
     309            7 : void FeedDiscovery::onValidateFeeds()
     310              : {
     311              :     // Use the sorted feed URLs from onPageGrabberReady
     312            7 :     if (_sortedFeedURLs.isEmpty()) {
     313            0 :         reportError("No feeds to validate");
     314            0 :         return;
     315              :     }
     316              : 
     317              :     // Bulk parse all feed URLs
     318            7 :     feedParser->parse(_sortedFeedURLs);
     319              : }
     320              : 
     321           24 : void FeedDiscovery::onFeedParserReady()
     322              : {
     323              :     // Process all parsed feeds
     324           24 :     _discoveredFeeds.clear();
     325              : 
     326           24 :     QMap<QUrl, ParserInterface::ParseResult> results = feedParser->getResults();
     327          169 :     for (auto it = results.constBegin(); it != results.constEnd(); ++it) {
     328          145 :         QUrl feedURL = it.key();
     329          145 :         ParserInterface::ParseResult result = it.value();
     330              : 
     331              :         // Only include successfully parsed feeds that have items.
     332          145 :         if (result == ParserInterface::OK) {
     333           10 :             RawFeed* feed = feedParser->getFeed(feedURL);
     334           10 :             if (feed && !feed->items.isEmpty()) {
     335            8 :                 DiscoveredFeed discovered;
     336            8 :                 discovered.url = feedURL;
     337            8 :                 discovered.feed = feed;  // Feed is owned by feedParser
     338            8 :                 discovered.title = feed->title.isEmpty() ? feedURL.toString() : feed->title;
     339            8 :                 discovered.content = "";  // Not storing raw content anymore
     340            8 :                 discovered.validated = true;
     341            8 :                 _discoveredFeeds.append(discovered);
     342            8 :             }
     343              :         }
     344          145 :     }
     345              : 
     346              :     // Check if we found any valid feeds.
     347           24 :     if (_discoveredFeeds.isEmpty()) {
     348           17 :         if (_probingCommonPaths) {
     349              :             // Common paths didn't turn up anything.
     350           28 :             qCDebug(logUtility) << "No valid feeds at common paths, trying sitemap";
     351           14 :             _probingCommonPaths = false;
     352           14 :             machine.setState(TRY_GOOGLE_NEWS_SITEMAP);
     353              :         } else {
     354              :             // Validation of HTML-discovered feeds failed.
     355            6 :             qCDebug(logUtility) << "No valid feeds found, trying common paths";
     356            3 :             machine.setState(TRY_COMMON_PATHS);
     357              :         }
     358           17 :         return;
     359              :     }
     360              : 
     361            7 :     _probingCommonPaths = false;
     362              : 
     363              :     // Set the first valid feed as the primary one (for backward compatibility)
     364            7 :     _feedURL = _discoveredFeeds.first().url;
     365            7 :     _feedResult = _discoveredFeeds.first().feed;
     366              : 
     367              :     // Emit done signal
     368            7 :     machine.setState(FEED_FOUND);
     369           24 : }
     370              : 
     371           17 : QStringList FeedDiscovery::commonFeedPaths()
     372              : {
     373              :     return {
     374              :         "/feed",
     375              :         "/rss",
     376              :         "/feed.xml",
     377              :         "/rss.xml",
     378              :         "/rss2.0.xml",
     379              :         "/atom.xml",
     380              :         "/index.xml",
     381              :         "/blog/feed"
     382          153 :     };
     383           17 : }
     384              : 
     385           17 : void FeedDiscovery::onTryCommonPaths()
     386              : {
     387           17 :     QUrl rootUrl;
     388           17 :     rootUrl.setScheme(_feedURL.scheme());
     389           17 :     rootUrl.setHost(_feedURL.host());
     390           17 :     if (_feedURL.port() != -1) {
     391            0 :         rootUrl.setPort(_feedURL.port());
     392              :     }
     393              : 
     394           17 :     QList<QUrl> probeURLs;
     395          153 :     for (const QString& path : commonFeedPaths()) {
     396          136 :         QUrl probeUrl = rootUrl;
     397          136 :         probeUrl.setPath(path);
     398          136 :         probeURLs.append(probeUrl);
     399          153 :     }
     400              : 
     401           34 :     qCDebug(logUtility) << "Probing" << probeURLs.count() << "common feed paths";
     402           17 :     _probingCommonPaths = true;
     403           17 :     feedParser->parse(probeURLs);
     404           17 : }
     405              : 
     406           14 : void FeedDiscovery::onTryGoogleNewsSitemap()
     407              : {
     408              :     // Extract site title from the already-fetched XHTML (if available).
     409              :     // The page content may be a redirect or error page, so validate the title.
     410           14 :     QString siteTitle;
     411           14 :     if (!_pageXHTML.isEmpty()) {
     412            5 :         PageMetadata meta = PageMetadataExtractor::extract(_pageXHTML);
     413              :         // Reject titles that look like HTTP status messages.
     414           15 :         if (!meta.title.isEmpty()
     415           10 :             && !meta.title.contains("Moved", Qt::CaseInsensitive)
     416           10 :             && !meta.title.contains("Forbidden", Qt::CaseInsensitive)
     417           10 :             && !meta.title.contains("Not Found", Qt::CaseInsensitive)
     418           10 :             && !meta.title.contains("Error", Qt::CaseInsensitive)) {
     419            5 :             siteTitle = meta.title;
     420              :         }
     421            5 :     }
     422           14 :     if (siteTitle.isEmpty()) {
     423            9 :         siteTitle = _feedURL.host();
     424              :     }
     425              : 
     426           28 :     qCDebug(logUtility) << "FeedDiscovery: trying sitemap for" << _feedURL
     427           14 :                         << "with title" << siteTitle;
     428              : 
     429           14 :     if (!newsSitemapSynthesizer) {
     430            2 :         newsSitemapSynthesizer = new GoogleNewsSitemapSynthesizer(this);
     431              :     }
     432           14 :     connect(newsSitemapSynthesizer, &GoogleNewsSitemapSynthesizer::done,
     433           14 :             this, &FeedDiscovery::onNewsSitemapDone, Qt::UniqueConnection);
     434           14 :     newsSitemapSynthesizer->synthesize(_feedURL, siteTitle);
     435           14 : }
     436              : 
     437           14 : void FeedDiscovery::onNewsSitemapDone()
     438              : {
     439           14 :     if (newsSitemapSynthesizer->hasError()) {
     440           12 :         reportError(newsSitemapSynthesizer->errorString());
     441           12 :         return;
     442              :     }
     443              : 
     444            2 :     RawFeed* synthFeed = newsSitemapSynthesizer->result();
     445            2 :     if (!synthFeed || synthFeed->items.isEmpty()) {
     446            0 :         reportError("No feed found");
     447            0 :         return;
     448              :     }
     449              : 
     450              :     // Set the primary feed result.
     451            2 :     _feedURL = synthFeed->url;
     452            2 :     _feedResult = synthFeed;
     453              : 
     454              :     // Add to discovered feeds list.
     455            2 :     DiscoveredFeed discovered;
     456            2 :     discovered.url = synthFeed->url;
     457            2 :     discovered.feed = synthFeed;
     458            2 :     discovered.title = synthFeed->title;
     459            2 :     discovered.validated = true;
     460            2 :     _discoveredFeeds.clear();
     461            2 :     _discoveredFeeds.append(discovered);
     462              : 
     463            2 :     machine.setState(FEED_FOUND);
     464            2 : }
     465              : 
     466           12 : void FeedDiscovery::reportError(QString errorString)
     467              : {
     468           12 :     _error = true;
     469           12 :     _errorString = errorString;
     470              : 
     471           12 :     machine.setState(FEED_ERROR);
     472           12 : }
        

Generated by: LCOV version 2.0-1