LCOV - code coverage report
Current view: top level - src/utilities - FeedDiscovery.h (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 100.0 % 7 7
Test Date: 2026-03-23 10:19:47 Functions: 100.0 % 7 7

            Line data    Source code
       1              : #ifndef FEEDDISCOVERY_H
       2              : #define FEEDDISCOVERY_H
       3              : 
       4              : #include <QList>
       5              : #include <QObject>
       6              : #include <QString>
       7              : #include <QStringList>
       8              : #include <QTimer>
       9              : #include <QUrl>
      10              : 
      11              : #include <QSimpleStateMachine/QSimpleStateMachine.h>
      12              : 
      13              : #include "../parser/ParserInterface.h"
      14              : #include "../parser/RawFeed.h"
      15              : #include "../parser/BatchNewsParser.h"
      16              : #include "../utilities/WebPageGrabber.h"
      17              : #include "../utilities/GoogleNewsSitemapSynthesizer.h"
      18              : #include "../FangObject.h"
      19              : 
      20              : /*!
      21              :     \brief Attempts to match a user-submitted URL like "bob.com" to an actual news feed.
      22              :     This is done by massaging the URL so that Qt can fetch a document.  If that document is
      23              :     a web page, we search for a link to the RSS or Atom feed in the HTML.  Then we check
      24              :     that document to see if it's an RSS feed.
      25              : 
      26              :     HTTP and HTML redirects are handled.
      27              : 
      28              :     State machine diagram:
      29              : 
      30              :     CHECK_FEED
      31              :         |  \- FEED_ERROR
      32              :         |
      33              :      TRY_FEED
      34              :         |  \- FEED_FOUND
      35              :         |
      36              :     WEB_GRABBER
      37              :         |  \- VALIDATE_FEEDS -> FEED_FOUND
      38              :         |
      39              :     TRY_COMMON_PATHS
      40              :         |  \- FEED_FOUND
      41              :         |
      42              :     TRY_GOOGLE_NEWS_SITEMAP
      43              :         |  \- FEED_FOUND
      44              :         |  \- FEED_ERROR
      45              :  */
      46              : class FeedDiscovery : public FangObject
      47              : {
      48              :     Q_OBJECT
      49              :     
      50              : private:
      51              : 
      52              :     enum FeedDiscoveryState {
      53              :         CHECK_FEED,
      54              :         TRY_FEED,
      55              :         WEB_GRABBER,
      56              :         VALIDATE_FEEDS,      // Bulk feed validation
      57              :         TRY_COMMON_PATHS,    // Probe well-known RSS paths (/feed, /rss, etc.)
      58              :         TRY_GOOGLE_NEWS_SITEMAP, // Google News sitemap-based feed synthesis
      59              :         FEED_FOUND,
      60              :         FEED_ERROR
      61              :     };
      62              :     
      63              : public:
      64              :     /*!
      65              :         \brief Structure to hold a discovered feed together with its metadata.
      66              :      */
      67              :     struct DiscoveredFeed {
      68              :         QUrl url;              // Feed URL
      69              :         QString title;         // Feed title (from parsed feed or URL)
      70              :         QString content;       // Downloaded feed content (for lazy parsing)
      71              :         RawFeed* feed;         // Parsed feed (nullptr if not yet parsed)
      72              :         bool validated;        // Has this feed been successfully parsed?
      73              : 
      74           27 :         DiscoveredFeed() : feed(nullptr), validated(false) {}
      75              :     };
      76              : 
      77              :     explicit FeedDiscovery(QObject *parent = nullptr,
      78              :                           ParserInterface* firstParser = nullptr,
      79              :                           ParserInterface* secondParser = nullptr,
      80              :                           WebPageGrabber* pageGrabber = nullptr,
      81              :                           BatchNewsParser* feedParser = nullptr,
      82              :                           GoogleNewsSitemapSynthesizer* sitemapSynthesizer = nullptr);
      83              :     virtual ~FeedDiscovery();
      84              : 
      85              :     /*!
      86              :         \return After done(), this returns true if there was an error.
      87              :      */
      88           39 :     bool error() { return _error; }
      89              : 
      90              :     /*!
      91              :         \return After done(), this returns the error string, if there was an error.
      92              :      */
      93           14 :     QString errorString() { return _errorString; }
      94              : 
      95              :     /*!
      96              :         \return The feed URL, or an empty URL if there was an error.
      97              :                 For backward compatibility: returns first validated feed.
      98              :      */
      99           24 :     QUrl feedURL() { return _error ? QUrl("") : _feedURL; }
     100              : 
     101              :     /*!
     102              :         \return The raw feed, or nullptr if there was an error.
     103              :                 For backward compatibility: returns first validated feed.
     104              :      */
     105            9 :     RawFeed* feedResult() { return _error ? nullptr : _feedResult; }
     106              : 
     107              :     /*!
     108              :         \return A copy of the list of all discovered feeds (may be empty on error or in single-feed mode).
     109              :      */
     110            7 :     QList<DiscoveredFeed> discoveredFeeds() const { return _discoveredFeeds; }
     111              : 
     112              :     /*!
     113              :         \return Number of feeds discovered (0 = error or single-feed mode, 1+ = multi-feed).
     114              :      */
     115            9 :     int feedCount() const { return _discoveredFeeds.count(); }
     116              :     
     117              : signals:
     118              : 
     119              :     /*!
     120              :         \brief Completion signal. Check error() first; if it returns false, get feedURL().
     121              :         \param feedDiscovery NOTE(review): presumably the FeedDiscovery that finished -- confirm.
     122              :      */
     123              :     void done(FeedDiscovery* feedDiscovery);
     124              :     
     125              : public slots:
     126              : 
     127              :     /*!
     128              :         \brief Call this with the URL to check to get started, then wait for done().
     129              :         \param sURL The URL to check.
     130              :      */
     131              :     virtual void checkFeed(QString sURL);
     132              : 
     133              :     /*!
     134              :         \brief Try to find RSS and Atom feed(s), if available.
     135              :                External use: Intended for use in unit tests.
     136              :         \param document NOTE(review): presumably the XHTML text to search -- confirm.
     137              :      */
     138              :     QList<QString> parseFeedsFromXHTML(const QString& document);
     139              :     
     140              : private slots:
     141              : 
     142              :     // State change slots:
     143              :     void onTryFeed();
     144              :     void onFeedFound();
     145              :     void onWebGrabber();
     146              :     void onValidateFeeds();    // Bulk feed validation
     147              :     void onTryCommonPaths();   // Probe well-known RSS paths
     148              :     void onTryGoogleNewsSitemap(); // Google News sitemap-based feed synthesis
     149              :     void onError();
     150              : 
     151              :     // Parser/BulkParser slots:
     152              :     void onFirstParseDone();
     153              :     void onFeedParserReady();
     154              : 
     155              :     // WebPageGrabber slots:
     156              :     void onPageGrabberReady(WebPageGrabber* grabber, QString* document);
     157              : 
     158              :     // GoogleNewsSitemapSynthesizer slots:
     159              :     void onNewsSitemapDone();
     160              : 
     161              :     // Timeout slot:
     162              :     void onTimeout();
     163              : 
     164              : protected:
     165              :     ParserInterface* parserFirstTry;
     166              :     WebPageGrabber* pageGrabber;            // For fetching HTML pages
     167              :     BatchNewsParser* feedParser;            // For bulk feed parsing
     168              : 
     169              : private:
     170              :     // Common RSS/Atom paths to probe when no feeds are found in HTML.
     171              :     static QStringList commonFeedPaths();
     172              : 
     173              :     // Sets the error flag and error string, then triggers the FEED_ERROR state.
     174              :     void reportError(QString errorString);
     175              : 
     176              :     QSimpleStateMachine machine;
     177              : 
     178              :     QUrl _feedURL;
     179              :     bool _error;
     180              :     QString _errorString;
     181              : 
     182              :     RawFeed* _feedResult;
     183              : 
     184              :     // Multi-feed discovery state
     185              :     QList<DiscoveredFeed> _discoveredFeeds;  // All discovered feeds
     186              :     QList<QUrl> _sortedFeedURLs;             // Feed URLs to validate (sorted by path length)
     187              : 
     188              :     // Common path probing state
     189              :     bool _probingCommonPaths;
     190              : 
     191              :     // Sitemap state
     192              :     GoogleNewsSitemapSynthesizer* newsSitemapSynthesizer;
     193              :     QString _pageXHTML;  // Stored XHTML from web grabber for sitemap fallback
     194              : 
     195              :     // Overall discovery timeout
     196              :     QTimer timeoutTimer;
     197              : };
     198              : 
     199              : #endif // FEEDDISCOVERY_H
        

Generated by: LCOV version 2.0-1