LCOV - code coverage report
Current view: top level - lib/FangFeedDiscovery - NewsSitemapSynthesizer.h (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 100.0 % 3 3
Test Date: 2026-04-19 00:35:54 Functions: 100.0 % 3 3

            Line data    Source code
       1              : #ifndef NEWSSITEMAPSYNTHESIZER_H
       2              : #define NEWSSITEMAPSYNTHESIZER_H
       3              : 
       4              : #include <memory>
       5              : 
       6              : #include <QDateTime>
       7              : #include <QList>
       8              : #include <QMap>
       9              : #include <QStringList>
      10              : #include <QUrl>
      11              : 
      12              : #include "QBatchWebDownload.h"
      13              : #include "RawFeed.h"
      14              : #include "SitemapParser.h"
      15              : 
      16              : class QWebDownload;
      17              : 
      18              : class NewsSitemapSynthesizer : public QObject // QObject-based worker that turns a site's news sitemap into a RawFeed; read the outcome via hasError()/errorString()/result().
      19              : {
      20              :     Q_OBJECT
      21              : public:
      22              :     explicit NewsSitemapSynthesizer(QObject* parent = nullptr); // parent: optional QObject parent, defaults to nullptr.
      23              :     virtual ~NewsSitemapSynthesizer(); // Virtual destructor, defined out of line.
      24              : 
      25              :     /*!
      26              :         \brief For initial discovery (FeedDiscovery). Probes news sitemap URLs.
      27              :         \param siteUrl The site's base URL (scheme + host).
      28              :         \param siteTitle Title extracted from the homepage.
      29              :      */
      30              :     virtual void synthesize(const QUrl& siteUrl, const QString& siteTitle);
      31              : 
      32              :     /*!
      33              :         \brief For refresh (UpdateFeedOperation). Fetches the stored sitemap URL.
      34              :         \param sitemapUrl The news sitemap URL to fetch.
      35              :         \param feedTitle The feed's stored title.
      36              :         \param since Only include entries newer than this date.
      37              :      */
      38              :     virtual void synthesize(const QUrl& sitemapUrl, const QString& feedTitle,
      39              :                             const QDateTime& since);
      40              : 
      41           14 :     bool hasError() const { return _hasError; } // Returns the current value of the _hasError flag.
      42           12 :     QString errorString() const { return _errorString; } // Returns a copy of _errorString.
      43            2 :     std::shared_ptr<RawFeed> result() const { return _result; } // Returns _result by value (shared ownership). NOTE(review): may be null when no feed was produced - confirm.
      44              : 
      45              : signals:
      46              :     void done(); // NOTE(review): presumably emitted once a synthesize() run has finished, success or failure - confirm in the .cpp.
      47              : 
      48              : private slots: // NOTE(review): presumably connected to the downloader's finished/error notifications - confirm.
      49              :     void onDownloadFinished(const QUrl& url, const QByteArray& data);
      50              :     void onDownloadError(const QUrl& url, const QString& errorString);
      51              : 
      52              : private:
      53              :     enum State { IDLE, FETCHING_ROBOTS_TXT, FETCHING_CANDIDATE, FETCHING_SUB_SITEMAP,
      54              :                  FETCHING_DESCRIPTIONS }; // Values held by the 'state' member; the names mirror the fetch*/handle* helpers below.
      55              : 
      56              :     static const int MAX_ENTRIES = 30; // NOTE(review): presumably the cap on entries kept for the feed - confirm where it is used.
      57              : 
      58              :     void reportError(const QString& error);
      59              :     void fetchRobotsTxt();
      60              :     void buildCandidateUrls(const QList<QUrl>& robotsSitemaps);
      61              :     void tryNextCandidate();
      62              :     void tryNextSubSitemap();
      63              :     void processParsedEntries(const QList<SitemapEntry>& entries, const QUrl& sourceUrl);
      64              :     void buildRawFeed();
      65              :     static QString normalizeLanguage(const QString& lang); // Static: uses no instance state.
      66              : 
      67              :     void handleRobotsTxtResponse(const QUrl& url, const QByteArray& data); // Response/error handler pairs follow, one per fetch phase. NOTE(review): presumably dispatched from the private slots according to 'state' - confirm.
      68              :     void handleRobotsTxtError(const QUrl& url, const QString& errorString);
      69              :     void handleCandidateResponse(const QUrl& url, const QByteArray& data);
      70              :     void handleCandidateError(const QUrl& url, const QString& errorString);
      71              :     void handleSubSitemapResponse(const QUrl& url, const QByteArray& data);
      72              :     void handleSubSitemapError(const QUrl& url, const QString& errorString);
      73              : 
      74              :     void filterByLanguage(QList<SitemapEntry>& entries); // Takes the list by non-const reference, so it can be modified in place.
      75              :     bool filterBySinceDate(QList<SitemapEntry>& entries); // Also takes the list by non-const reference. NOTE(review): meaning of the bool result is not visible here - check the .cpp.
      76              : 
      77              :     void fetchDescriptions();
      78              :     void onDescriptionsReady();
      79              : 
      80              : protected:
      81              :     void setResultState(std::shared_ptr<RawFeed> result, bool hasError, const QString& errorString); // Protected, unlike the other helpers, so subclasses can call it. NOTE(review): presumably for test doubles that override synthesize() - confirm.
      82              : 
      83              : public: // Static helpers with public access. NOTE(review): presumably public so they can be unit tested directly - confirm.
      84              :     static QList<QUrl> parseRobotsSitemaps(const QString& robotsTxt, const QUrl& siteBaseUrl);
      85              :     static QStringList newsSitemapPaths();
      86              :     static QList<SitemapEntry> deduplicateRepetitiveTitles(
      87              :         const QList<SitemapEntry>& entries,
      88              :         int prefixWordCount = 4,
      89              :         int repetitionThreshold = 3); // Returns a new list; the input is taken by const reference. NOTE(review): exact prefix/threshold semantics live in the .cpp - confirm.
      90              : 
      91              : private:
      92              : 
      93              :     // Config
      94              :     QUrl siteBaseUrl;
      95              :     QString feedTitle;
      96              :     QDateTime since;
      97              :     bool isRefresh; // NOTE(review): presumably true when started through the three-argument synthesize() - confirm.
      98              : 
      99              :     // State
     100              :     State state;
     101              :     bool _hasError; // Backing field for hasError().
     102              :     QString _errorString; // Backing field for errorString().
     103              :     std::shared_ptr<RawFeed> _result; // Backing field for result().
     104              : 
     105              :     // Candidate probing
     106              :     QList<QUrl> candidateUrls;
     107              :     QWebDownload* downloader; // Raw pointer; ownership is not visible in this header.
     108              : 
     109              :     // Sub-sitemap iteration (for sitemapindex responses)
     110              :     QList<SubSitemap> pendingSubSitemaps;
     111              :     QList<SitemapEntry> accumulatedEntries;
     112              :     QUrl sitemapIndexUrl;
     113              : 
     114              :     // Filtered entries ready for feed building
     115              :     QList<SitemapEntry> feedEntries;
     116              :     QUrl feedSourceUrl; // The actual sitemap URL that provided the entries
     117              : 
     118              :     // Description enrichment
     119              :     QBatchWebDownload* descriptionDownloader; // Raw pointer; ownership is not visible in this header.
     120              :     QMap<QUrl, QString> fetchedDescriptions; // Keyed by QUrl. NOTE(review): presumably article URL -> fetched description text - confirm.
     121              : };
     122              : 
     123              : #endif // NEWSSITEMAPSYNTHESIZER_H
        

Generated by: LCOV version 2.0-1