Line data Source code
1 : #ifndef FEEDDISCOVERY_H
2 : #define FEEDDISCOVERY_H
3 :
4 : #include <QList>
5 : #include <QObject>
6 : #include <QString>
7 : #include <QStringList>
8 : #include <QTimer>
9 : #include <QUrl>
10 :
11 : #include <QSimpleStateMachine/QSimpleStateMachine.h>
12 :
13 : #include "../parser/ParserInterface.h"
14 : #include "../parser/RawFeed.h"
15 : #include "../parser/BatchNewsParser.h"
16 : #include "../utilities/WebPageGrabber.h"
17 : #include "../utilities/GoogleNewsSitemapSynthesizer.h"
18 : #include "../FangObject.h"
19 :
20 : /*!
21 : \brief Attempts to match a user-submitted URL like "bob.com" to an actual news feed.
22 : This is done by massaging the URL so that Qt can fetch a document. If that document is
23 : a web page, we search for a link to the RSS or Atom feed in the HTML. Then we check
24 : that document to see if it's an RSS feed.
25 :
26 : HTTP and HTML redirects are handled.
27 :
28 : State machine diagram:
29 :
30 : CHECK_FEED
31 : | \- FEED_ERROR
32 : |
33 : TRY_FEED
34 : | \- FEED_FOUND
35 : |
36 : WEB_GRABBER
37 : | \- VALIDATE_FEEDS -> FEED_FOUND
38 : |
39 : TRY_COMMON_PATHS
40 : | \- FEED_FOUND
41 : |
42 : TRY_GOOGLE_NEWS_SITEMAP
43 : | \- FEED_FOUND
44 : | \- FEED_ERROR
45 : */
46 : class FeedDiscovery : public FangObject
47 : {
48 : Q_OBJECT
49 :
50 : private:
51 :
52 : enum FeedDiscoveryState { // States of the discovery state machine
53 : CHECK_FEED,
54 : TRY_FEED, // See onTryFeed()
55 : WEB_GRABBER, // See onWebGrabber()
56 : VALIDATE_FEEDS, // Bulk feed validation; see onValidateFeeds()
57 : TRY_COMMON_PATHS, // Probe well-known RSS paths (/feed, /rss, etc.); see onTryCommonPaths()
58 : TRY_GOOGLE_NEWS_SITEMAP, // Google News sitemap-based feed synthesis; see onTryGoogleNewsSitemap()
59 : FEED_FOUND, // See onFeedFound()
60 : FEED_ERROR // See onError() and reportError()
61 : };
62 :
63 : public:
64 : /*!
65 : \brief One discovered feed together with its metadata. Default-constructed with no parsed feed and validated == false.
66 : */
67 : struct DiscoveredFeed {
68 : QUrl url; // Feed URL
69 : QString title; // Feed title (from parsed feed or URL)
70 : QString content; // Downloaded feed content (for lazy parsing)
71 : RawFeed* feed; // Parsed feed (nullptr if not yet parsed). NOTE(review): ownership is not visible in this header.
72 : bool validated; // Has this feed been successfully parsed?
73 :
74 27 : DiscoveredFeed() : feed(nullptr), validated(false) {} // url, title and content are default-constructed
75 : };
76 :
77 : explicit FeedDiscovery(QObject *parent = nullptr, // Every argument is optional and defaults to nullptr
78 : ParserInterface* firstParser = nullptr,
79 : ParserInterface* secondParser = nullptr,
80 : WebPageGrabber* pageGrabber = nullptr,
81 : BatchNewsParser* feedParser = nullptr,
82 : GoogleNewsSitemapSynthesizer* sitemapSynthesizer = nullptr);
83 : virtual ~FeedDiscovery();
84 :
85 : /*!
86 : \return The internal error flag: true if there was an error. Meaningful only after done().
87 : */
88 39 : bool error() { return _error; }
89 :
90 : /*!
91 : \return The stored error string. Meaningful only after done(), and only if error() is true.
92 : */
93 14 : QString errorString() { return _errorString; }
94 :
95 : /*!
96 : \return The feed URL, or an empty QUrl while the error flag is set.
97 : For backward compatibility: returns first validated feed.
98 : */
99 24 : QUrl feedURL() { return _error ? QUrl("") : _feedURL; }
100 :
101 : /*!
102 : \return The raw feed, or nullptr while the error flag is set.
103 : For backward compatibility: returns first validated feed.
104 : */
105 9 : RawFeed* feedResult() { return _error ? nullptr : _feedResult; }
106 :
107 : /*!
108 : \return Copy of the list of all discovered feeds (may be empty if error or single-feed mode)
109 : */
110 7 : QList<DiscoveredFeed> discoveredFeeds() const { return _discoveredFeeds; }
111 :
112 : /*!
113 : \return Size of the discovered-feed list (0 = error or single-feed mode, 1+ = multi-feed)
114 : */
115 9 : int feedCount() const { return _discoveredFeeds.count(); }
116 :
117 : signals:
118 :
119 : /*!
120 : \brief Completion signal. Check error(); if it returns false, read feedURL().
121 : \param feedDiscovery NOTE(review): presumably the emitting FeedDiscovery instance; confirm in the implementation.
122 : */
123 : void done(FeedDiscovery* feedDiscovery);
124 :
125 : public slots:
126 :
127 : /*!
128 : \brief Starts checking the given URL. Completion is reported through done().
129 : \param sURL URL string to check, e.g. a user-submitted value such as "bob.com".
130 : */
131 : virtual void checkFeed(QString sURL);
132 :
133 : /*!
134 : \brief Try to find RSS and Atom feed(s), if available. Returns a list of strings (presumably feed URLs; confirm in the implementation).
135 : External use: Intended for use in unit tests.
136 : \param document The document text to search.
137 : */
138 : QList<QString> parseFeedsFromXHTML(const QString& document);
139 :
140 : private slots:
141 :
142 : // State change slots (names mirror the FeedDiscoveryState values):
143 : void onTryFeed();
144 : void onFeedFound();
145 : void onWebGrabber();
146 : void onValidateFeeds(); // Bulk feed validation
147 : void onTryCommonPaths(); // Probe well-known RSS paths
148 : void onTryGoogleNewsSitemap(); // Google News sitemap-based feed synthesis
149 : void onError();
150 :
151 : // Parser/BulkParser slots:
152 : void onFirstParseDone();
153 : void onFeedParserReady();
154 :
155 : // WebPageGrabber slots:
156 : void onPageGrabberReady(WebPageGrabber* grabber, QString* document);
157 :
158 : // GoogleNewsSitemapSynthesizer slots:
159 : void onNewsSitemapDone();
160 :
161 : // Timeout slot:
162 : void onTimeout();
163 :
164 : protected:
165 : ParserInterface* parserFirstTry; // NOTE(review): presumably the constructor's firstParser; confirm in the implementation
166 : WebPageGrabber* pageGrabber; // For fetching HTML pages
167 : BatchNewsParser* feedParser; // For bulk feed parsing
168 :
169 : private:
170 : // Common RSS/Atom paths to probe when no feeds are found in HTML.
171 : static QStringList commonFeedPaths();
172 :
173 : // Sets the error flag, error string, and triggers the ERROR state.
174 : void reportError(QString errorString);
175 :
176 : QSimpleStateMachine machine;
177 :
178 : QUrl _feedURL; // Returned by feedURL() when the error flag is clear
179 : bool _error; // Error flag returned by error()
180 : QString _errorString; // Returned by errorString()
181 :
182 : RawFeed* _feedResult; // Returned by feedResult() when the error flag is clear
183 :
184 : // Multi-feed discovery state
185 : QList<DiscoveredFeed> _discoveredFeeds; // All discovered feeds
186 : QList<QUrl> _sortedFeedURLs; // Feed URLs to validate (sorted by path length)
187 :
188 : // Common path probing state
189 : bool _probingCommonPaths;
190 :
191 : // Sitemap state
192 : GoogleNewsSitemapSynthesizer* newsSitemapSynthesizer;
193 : QString _pageXHTML; // Stored XHTML from web grabber for sitemap fallback
194 :
195 : // Overall discovery timeout
196 : QTimer timeoutTimer;
197 : };
198 :
199 : #endif // FEEDDISCOVERY_H
|