Line data Source code
1 : #ifndef FEEDDISCOVERY_H
2 : #define FEEDDISCOVERY_H
3 :
4 : #include <memory>
5 :
6 : #include <QList>
7 : #include <QObject>
8 : #include <QString>
9 : #include <QStringList>
10 : #include <QTimer>
11 : #include <QUrl>
12 :
13 : class QSimpleStateMachine;
14 :
15 : #include "RawFeed.h"
16 :
17 : class FeedSource;
18 : class BatchFeedFetcher;
19 : class WebPageGrabber;
20 : class NewsSitemapSynthesizer;
21 :
22 : /*!
23 : \brief Attempts to match a user-submitted, URL like "bob.com" to an actual news feed.
24 : This is done by massaging the URL so that Qt can fetch a document. If that document is
25 : a web page, we search for a link to the RSS or Atom feed in the HTML. Then we check
26 : that document to see if it's an RSS feed.
27 :
28 : HTTP and HTML redirects are handled.
29 :
30 : State machine diagram:
31 :
32 : CHECK_FEED
33 : | \- ERROR
34 : |
35 : TRY_FEED
36 : | \- FEED_FOUND
37 : |
38 : WEB_GRABBER
39 : | \- VALIDATE_FEEDS -> FEED_FOUND
40 : |
41 : TRY_COMMON_PATHS
42 : | \- FEED_FOUND
43 : |
44 : TRY_NEWS_SITEMAP
45 : | \- FEED_FOUND
46 : | \- FEED_ERROR
47 : */
48 : class FeedDiscovery : public QObject
49 : {
50 : Q_OBJECT
51 :
52 : private:
53 :
54 : enum FeedDiscoveryState {
55 : CHECK_FEED,
56 : TRY_FEED,
57 : WEB_GRABBER,
58 : VALIDATE_FEEDS, // Bulk feed validation
59 : TRY_COMMON_PATHS, // Probe well-known RSS paths (/feed, /rss, etc.)
60 : TRY_NEWS_SITEMAP, // News sitemap-based feed synthesis
61 : FEED_FOUND,
62 : FEED_ERROR
63 : };
64 :
65 : public:
66 : enum class Error {
67 : None,
68 : InvalidURL,
69 : NoFeedsFound,
70 : NetworkError,
71 : Timeout
72 : };
73 :
74 : /*!
75 : \brief Structure to hold a discovered feed with metadata
76 : */
77 : struct DiscoveredFeed {
78 : QUrl url; // Feed URL
79 : QString title; // Feed title (from parsed feed or URL)
80 : std::shared_ptr<RawFeed> feed; // Parsed feed (nullptr if not yet parsed)
81 : bool validated = false; // Has this feed been successfully parsed?
82 : };
83 :
84 : explicit FeedDiscovery(QObject *parent = nullptr,
85 : FeedSource* firstParser = nullptr,
86 : WebPageGrabber* pageGrabber = nullptr,
87 : BatchFeedFetcher* feedParser = nullptr,
88 : NewsSitemapSynthesizer* sitemapSynthesizer = nullptr);
89 : virtual ~FeedDiscovery();
90 :
91 : /*!
92 : \return After done(), this returns the error code (None if successful).
93 : */
94 39 : Error error() { return _error; }
95 :
96 : /*!
97 : \return After done(), this returns the error string, if there was an error.
98 : */
99 14 : QString errorString() { return _errorString; }
100 :
101 : /*!
102 : \return The best feed URL, or an empty URL if there was an error.
103 : */
104 24 : QUrl feedURL() { return _error != Error::None ? QUrl("") : _feedURL; }
105 :
106 : /*!
107 : \return The best raw feed, or nullptr if there was an error.
108 : */
109 9 : std::shared_ptr<RawFeed> feedResult() {
110 9 : if (_error != Error::None || _discoveredFeeds.isEmpty()) {
111 0 : return nullptr;
112 : }
113 9 : return _discoveredFeeds.first().feed;
114 : }
115 :
116 : /*!
117 : \return List of all discovered feeds (may be empty if error or single-feed mode)
118 : */
119 7 : QList<DiscoveredFeed> discoveredFeeds() const { return _discoveredFeeds; }
120 :
121 : /*!
122 : \return Number of feeds discovered (0 = error or single-feed mode, 1+ = multi-feed)
123 : */
124 9 : int feedCount() const { return _discoveredFeeds.count(); }
125 :
126 : signals:
127 :
128 : /*!
129 : \brief Completion signal. Check for error; if false, get feedURL()
130 : \param feedDiscovery
131 : */
132 : void done(FeedDiscovery* feedDiscovery);
133 :
134 : public slots:
135 :
136 : /*!
137 : \brief Call this with a feed URL to check to get started! Wait for done()
138 : \param sURL
139 : */
140 : virtual void checkFeed(QString sURL);
141 :
142 : /*!
143 : \brief Try to find RSS and Atom feed(s), if available.
144 : \param document
145 : */
146 : QList<QString> parseFeedsFromXHTML(const QString& document);
147 :
148 : private slots:
149 :
150 : // State change slots:
151 : void onTryFeed();
152 : void onFeedFound();
153 : void onWebGrabber();
154 : void onValidateFeeds(); // Bulk feed validation
155 : void onTryCommonPaths(); // Probe well-known RSS paths
156 : void onTryNewsSitemap(); // News sitemap-based feed synthesis
157 : void onError();
158 :
159 : // Parser/BulkParser slots:
160 : void onFirstParseDone();
161 : void onFeedParserReady();
162 :
163 : // WebPageGrabber slots:
164 : void onPageGrabberReady(WebPageGrabber* grabber, QString* document);
165 :
166 : // NewsSitemapSynthesizer slots:
167 : void onNewsSitemapDone();
168 :
169 : // Timeout slot:
170 : void onTimeout();
171 :
172 : protected:
173 : FeedSource* parserFirstTry;
174 : WebPageGrabber* pageGrabber; // For fetching HTML pages
175 : BatchFeedFetcher* feedParser; // For bulk feed parsing
176 :
177 : private:
178 : // Common RSS/Atom paths to probe when no feeds are found in HTML.
179 : static QStringList commonFeedPaths();
180 :
181 : // Sets the error code, error string, and triggers the ERROR state.
182 : void reportError(Error error, const QString& errorString = {});
183 :
184 : QSimpleStateMachine* machine;
185 :
186 : QUrl _feedURL;
187 : Error _error;
188 : QString _errorString;
189 :
190 : // Multi-feed discovery state
191 : QList<DiscoveredFeed> _discoveredFeeds; // All discovered feeds
192 : QList<QUrl> _sortedFeedURLs; // Feed URLs to validate (sorted by path length)
193 :
194 : // Common path probing state
195 : bool _probingCommonPaths;
196 :
197 : // Sitemap state
198 : NewsSitemapSynthesizer* newsSitemapSynthesizer;
199 : QString _pageXHTML; // Stored XHTML from web grabber for sitemap fallback
200 :
201 : // Overall discovery timeout
202 : QTimer timeoutTimer;
203 : };
204 :
205 : #endif // FEEDDISCOVERY_H
|