Line data Source code
1 : #ifndef GOOGLENEWSSITEMAPSYNTHESIZER_H
2 : #define GOOGLENEWSSITEMAPSYNTHESIZER_H
3 :
4 : #include <QDateTime>
5 : #include <QList>
6 : #include <QStringList>
7 : #include <QUrl>
8 :
9 : #include "../FangObject.h"
10 : #include "../parser/RawFeed.h"
11 : #include "../parser/SitemapParser.h"
12 : #include "../network/NetworkDownloadCore.h"
13 :
14 : class GoogleNewsSitemapSynthesizer : public FangObject // Produces a RawFeed (see result()) from news sitemap entries; has a discovery and a refresh entry point.
15 : {
16 : Q_OBJECT
17 : public:
18 : explicit GoogleNewsSitemapSynthesizer(QObject* parent = nullptr);
19 : virtual ~GoogleNewsSitemapSynthesizer();
20 :
21 : /*!
22 : \brief For initial discovery (FeedDiscovery). Probes news sitemap URLs.
23 : \param siteUrl The site's base URL (scheme + host).
24 : \param siteTitle Title extracted from the homepage.
25 : */
26 : virtual void synthesize(const QUrl& siteUrl, const QString& siteTitle);
27 :
28 : /*!
29 : \brief For refresh (UpdateFeedOperation). Fetches the stored sitemap URL.
30 : \param sitemapUrl The news sitemap URL to fetch.
31 : \param feedTitle The feed's stored title.
32 : \param since Only include entries newer than this date.
33 : */
34 : virtual void synthesize(const QUrl& sitemapUrl, const QString& feedTitle,
35 : const QDateTime& since);
36 :
37 14 : bool hasError() const { return _hasError; } // Returns the stored error flag.
38 12 : QString errorString() const { return _errorString; } // Returns a copy of the stored error message.
39 2 : RawFeed* result() const { return _result; } // Returns the stored RawFeed pointer; ownership is not visible in this header -- TODO confirm.
40 :
41 : signals:
42 : void done(); // NOTE(review): presumably emitted when synthesis finishes (success or error) -- confirm in the .cpp.
43 :
44 : private slots: // One success/error handler pair per stage: robots.txt, candidate sitemap, sub-sitemap.
45 : void onRobotsTxtDownloaded(const QUrl& url, const QByteArray& data);
46 : void onRobotsTxtDownloadError(const QUrl& url, const QString& errorString);
47 : void onCandidateDownloaded(const QUrl& url, const QByteArray& data);
48 : void onCandidateDownloadError(const QUrl& url, const QString& errorString);
49 : void onSubSitemapDownloaded(const QUrl& url, const QByteArray& data);
50 : void onSubSitemapDownloadError(const QUrl& url, const QString& errorString);
51 :
52 : private:
53 : static const int MAX_ENTRIES = 30; // NOTE(review): presumably caps the entries placed in the synthesized feed -- confirm usage in the .cpp.
54 :
55 : void reportError(const QString& error);
56 : void fetchRobotsTxt();
57 : void buildCandidateUrls(const QList<QUrl>& robotsSitemaps);
58 : void tryNextCandidate();
59 : void tryNextSubSitemap();
60 : void processParsedEntries(const QList<SitemapEntry>& entries, const QUrl& sourceUrl);
61 : void buildRawFeed();
62 : static QString normalizeLanguage(const QString& lang);
63 :
64 : protected:
65 : void setResultState(RawFeed* result, bool hasError, const QString& errorString); // Protected, so subclasses can call it; presumably assigns _result/_hasError/_errorString -- confirm.
66 :
67 : public:
68 : // Public for unit testing.
69 : static QList<QUrl> parseRobotsSitemaps(const QString& robotsTxt, const QUrl& siteBaseUrl);
70 : static QStringList newsSitemapPaths();
71 : static QList<SitemapEntry> deduplicateRepetitiveTitles(
72 : const QList<SitemapEntry>& entries,
73 : int prefixWordCount = 4, // Default is 4; exact matching semantics live in the .cpp.
74 : int repetitionThreshold = 3); // Default is 3; exact matching semantics live in the .cpp.
75 :
76 : private:
77 :
78 : // Config
79 : QUrl siteBaseUrl;
80 : QString feedTitle;
81 : QDateTime since;
82 : bool isRefresh; // No in-class initializer; NOTE(review): presumably true for the refresh overload of synthesize() -- confirm.
83 :
84 : // State
85 : bool _hasError; // Backing field for hasError(); no in-class initializer, so the constructor (not shown) must set it.
86 : QString _errorString; // Backing field for errorString().
87 : RawFeed* _result; // Backing field for result(); no in-class initializer, so the constructor (not shown) must set it.
88 :
89 : // Candidate probing
90 : QList<QUrl> candidateUrls;
91 : NetworkDownloadCore* downloader; // Raw pointer with no in-class initializer; ownership/lifetime not visible here -- TODO confirm.
92 :
93 : // Sub-sitemap iteration (for sitemapindex responses)
94 : QList<SubSitemap> pendingSubSitemaps;
95 : QList<SitemapEntry> accumulatedEntries;
96 : QUrl sitemapIndexUrl;
97 :
98 : // Filtered entries ready for feed building
99 : QList<SitemapEntry> feedEntries;
100 : QUrl feedSourceUrl; // The actual sitemap URL that provided the entries
101 : };
102 :
103 : #endif // GOOGLENEWSSITEMAPSYNTHESIZER_H
|