LCOV - code coverage report
Current view: top level - src/utilities - GoogleNewsSitemapSynthesizer.h (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 100.0 % 3 3
Test Date: 2026-03-23 10:19:47 Functions: 100.0 % 3 3

            Line data    Source code
       1              : #ifndef GOOGLENEWSSITEMAPSYNTHESIZER_H
       2              : #define GOOGLENEWSSITEMAPSYNTHESIZER_H
       3              : 
       4              : #include <QDateTime>
       5              : #include <QList>
       6              : #include <QStringList>
       7              : #include <QUrl>
       8              : 
       9              : #include "../FangObject.h"
      10              : #include "../parser/RawFeed.h"
      11              : #include "../parser/SitemapParser.h"
      12              : #include "../network/NetworkDownloadCore.h"
      13              : 
      14              : class GoogleNewsSitemapSynthesizer : public FangObject // Synthesizes a RawFeed from a site's news sitemap (see the synthesize() overloads); outcome is read via hasError()/errorString()/result().
      15              : {
      16              :     Q_OBJECT
      17              : public:
      18              :     explicit GoogleNewsSitemapSynthesizer(QObject* parent = nullptr); // parent is optional and defaults to nullptr.
      19              :     virtual ~GoogleNewsSitemapSynthesizer();
      20              : 
      21              :     /*!
      22              :         \brief For initial discovery (FeedDiscovery). Probes news sitemap URLs.
      23              :         \param siteUrl The site's base URL (scheme + host).
      24              :         \param siteTitle Title extracted from the homepage.
      25              :         Returns nothing; read the outcome through hasError(), errorString() and result(). NOTE(review): presumably done() is emitted on completion -- confirm in the .cpp. */
      26              :     virtual void synthesize(const QUrl& siteUrl, const QString& siteTitle);
      27              : 
      28              :     /*!
      29              :         \brief For refresh (UpdateFeedOperation). Fetches the stored sitemap URL.
      30              :         \param sitemapUrl The news sitemap URL to fetch.
      31              :         \param feedTitle The feed's stored title.
      32              :         \param since Only include entries newer than this date.
      33              :         Returns nothing; read the outcome through hasError(), errorString() and result(). NOTE(review): presumably done() is emitted on completion -- confirm in the .cpp. */
      34              :     virtual void synthesize(const QUrl& sitemapUrl, const QString& feedTitle,
      35              :                             const QDateTime& since);
      36              : 
      37           14 :     bool hasError() const { return _hasError; } // Returns the stored error flag.
      38           12 :     QString errorString() const { return _errorString; } // Returns a copy of the stored error message.
      39            2 :     RawFeed* result() const { return _result; } // Returns the stored RawFeed pointer. NOTE(review): ownership is not visible in this header -- confirm before deleting or caching it.
      40              : 
      41              : signals:
      42              :     void done(); // NOTE(review): presumably emitted once a synthesize() call has finished, on success or error -- confirm in the .cpp.
      43              : 
      44              : private slots: // Download callbacks: one success/error pair each for robots.txt, candidate sitemaps and sub-sitemaps (per their names; connections are made outside this header).
      45              :     void onRobotsTxtDownloaded(const QUrl& url, const QByteArray& data);
      46              :     void onRobotsTxtDownloadError(const QUrl& url, const QString& errorString);
      47              :     void onCandidateDownloaded(const QUrl& url, const QByteArray& data);
      48              :     void onCandidateDownloadError(const QUrl& url, const QString& errorString);
      49              :     void onSubSitemapDownloaded(const QUrl& url, const QByteArray& data);
      50              :     void onSubSitemapDownloadError(const QUrl& url, const QString& errorString);
      51              : 
      52              : private:
      53              :     static const int MAX_ENTRIES = 30; // NOTE(review): presumably the cap on entries placed in the synthesized feed -- confirm where it is used.
      54              : 
      55              :     void reportError(const QString& error); // Internal helpers below are defined in the .cpp; their behavior is not visible from this header.
      56              :     void fetchRobotsTxt();
      57              :     void buildCandidateUrls(const QList<QUrl>& robotsSitemaps);
      58              :     void tryNextCandidate();
      59              :     void tryNextSubSitemap();
      60              :     void processParsedEntries(const QList<SitemapEntry>& entries, const QUrl& sourceUrl);
      61              :     void buildRawFeed();
      62              :     static QString normalizeLanguage(const QString& lang);
      63              : 
      64              : protected:
      65              :     void setResultState(RawFeed* result, bool hasError, const QString& errorString); // Protected (not private), so subclasses can call it. NOTE(review): presumably stores these into _result/_hasError/_errorString -- confirm.
      66              : 
      67              : public:
      68              :     // Public for unit testing. All three are static, so they can be called without an instance.
      69              :     static QList<QUrl> parseRobotsSitemaps(const QString& robotsTxt, const QUrl& siteBaseUrl);
      70              :     static QStringList newsSitemapPaths();
      71              :     static QList<SitemapEntry> deduplicateRepetitiveTitles(
      72              :         const QList<SitemapEntry>& entries,
      73              :         int prefixWordCount = 4,
      74              :         int repetitionThreshold = 3);
      75              : 
      76              : private:
      77              : 
      78              :     // Config (NOTE(review): presumably filled in from the synthesize() arguments -- confirm in the .cpp)
      79              :     QUrl siteBaseUrl;
      80              :     QString feedTitle;
      81              :     QDateTime since;
      82              :     bool isRefresh; // No in-class initializer; must be set outside this header. NOTE(review): presumably true for the three-argument (refresh) synthesize() -- confirm.
      83              : 
      84              :     // State exposed through hasError(), errorString() and result()
      85              :     bool _hasError; // No in-class initializer; must be set outside this header.
      86              :     QString _errorString;
      87              :     RawFeed* _result; // No in-class initializer; must be set outside this header.
      88              : 
      89              :     // Candidate probing
      90              :     QList<QUrl> candidateUrls;
      91              :     NetworkDownloadCore* downloader; // No in-class initializer; creation and ownership are handled outside this header.
      92              : 
      93              :     // Sub-sitemap iteration (for sitemapindex responses)
      94              :     QList<SubSitemap> pendingSubSitemaps;
      95              :     QList<SitemapEntry> accumulatedEntries;
      96              :     QUrl sitemapIndexUrl;
      97              : 
      98              :     // Filtered entries ready for feed building
      99              :     QList<SitemapEntry> feedEntries;
     100              :     QUrl feedSourceUrl; // The actual sitemap URL that provided the entries
     101              : };
     102              : 
     103              : #endif // GOOGLENEWSSITEMAPSYNTHESIZER_H
        

Generated by: LCOV version 2.0-1