Line data Source code
1 : #include "FeedDiscovery.h"
2 : #include "FeedDiscoveryLogging.h"
3 : #include <QSimpleStateMachine/QSimpleStateMachine.h>
4 : #include "PageMetadataExtractor.h"
5 : #include <QXmlStreamReader>
6 : #include <QSet>
7 : #include <algorithm>
8 : #include "WebUtilities.h"
9 : #include "FeedSource.h"
10 : #include "FeedFetchResult.h"
11 : #include "FeedFetcher.h"
12 : #include "BatchFeedFetcher.h"
13 : #include "WebPageGrabber.h"
14 : #include "NewsSitemapSynthesizer.h"
15 :
// Constructs the feed-discovery pipeline.
//
// Any dependency passed as nullptr gets a default implementation created with
// `this` as parent. Injected dependencies that arrive without a parent are
// adopted (setParent(this)) so Qt's parent/child hierarchy destroys them with
// this object; dependencies that already have a parent are left alone.
// Note: sitemapSynthesizer may legitimately be nullptr here — it is created
// lazily in onTryNewsSitemap() if needed.
FeedDiscovery::FeedDiscovery(QObject *parent,
                             FeedSource* firstParser,
                             WebPageGrabber* pageGrabber,
                             BatchFeedFetcher* feedParser,
                             NewsSitemapSynthesizer* sitemapSynthesizer) :
    QObject(parent),
    machine(new QSimpleStateMachine(this)),
    _error(Error::None),
    _errorString(),
    _probingCommonPaths(false),
    newsSitemapSynthesizer(sitemapSynthesizer)
{
    // Create default implementations if not provided (with this as parent for auto-cleanup)
    parserFirstTry = firstParser ? firstParser : new FeedFetcher(this);
    this->pageGrabber = pageGrabber ? pageGrabber : new WebPageGrabber(this);
    this->feedParser = feedParser ? feedParser : new BatchFeedFetcher(this);

    // Take ownership of injected dependencies by setting parent
    if (parserFirstTry && !parserFirstTry->parent()) {
        parserFirstTry->setParent(this);
    }
    if (this->pageGrabber && !this->pageGrabber->parent()) {
        this->pageGrabber->setParent(this);
    }
    if (this->feedParser && !this->feedParser->parent()) {
        this->feedParser->setParent(this);
    }
    if (newsSitemapSynthesizer && !newsSitemapSynthesizer->parent()) {
        newsSitemapSynthesizer->setParent(this);
    }

    // Set up our state machine. Discovery escalates through stages:
    //   TRY_FEED (URL itself) -> WEB_GRABBER (scan HTML <link> tags)
    //   -> VALIDATE_FEEDS -> TRY_COMMON_PATHS -> TRY_NEWS_SITEMAP,
    // succeeding at FEED_FOUND or failing at FEED_ERROR.
    machine->addStateChange(CHECK_FEED, TRY_FEED, [this]() { onTryFeed(); });
    machine->addStateChange(TRY_FEED, FEED_FOUND, [this]() { onFeedFound(); });
    machine->addStateChange(TRY_FEED, WEB_GRABBER, [this]() { onWebGrabber(); });
    machine->addStateChange(WEB_GRABBER, VALIDATE_FEEDS, [this]() { onValidateFeeds(); });
    machine->addStateChange(VALIDATE_FEEDS, FEED_FOUND, [this]() { onFeedFound(); });
    machine->addStateChange(WEB_GRABBER, TRY_COMMON_PATHS, [this]() { onTryCommonPaths(); });
    machine->addStateChange(VALIDATE_FEEDS, TRY_COMMON_PATHS, [this]() { onTryCommonPaths(); });
    machine->addStateChange(TRY_COMMON_PATHS, FEED_FOUND, [this]() { onFeedFound(); });
    machine->addStateChange(TRY_COMMON_PATHS, TRY_NEWS_SITEMAP, [this]() { onTryNewsSitemap(); });
    machine->addStateChange(TRY_NEWS_SITEMAP, FEED_FOUND, [this]() { onFeedFound(); });

    machine->addStateChange(-1, FEED_ERROR, [this]() { onError(); }); // All errors.

    // Overall discovery timeout.
    timeoutTimer.setSingleShot(true);
    timeoutTimer.setInterval(30000);
    connect(&timeoutTimer, &QTimer::timeout, this, &FeedDiscovery::onTimeout);

    // Parser signals.
    connect(parserFirstTry, &FeedSource::done, this, &FeedDiscovery::onFirstParseDone);

    // Web page grabber signals.
    connect(this->pageGrabber, &WebPageGrabber::ready, this, &FeedDiscovery::onPageGrabberReady);
    connect(this->feedParser, &BatchFeedFetcher::ready, this, &FeedDiscovery::onFeedParserReady);
}
73 :
74 71 : FeedDiscovery::~FeedDiscovery()
75 : {
76 : // Qt parent/child hierarchy handles cleanup automatically
77 71 : }
78 :
79 39 : void FeedDiscovery::checkFeed(QString sURL)
80 : {
81 : // Reset state
82 39 : _error = Error::None;
83 39 : _errorString.clear();
84 39 : _probingCommonPaths = false;
85 39 : _discoveredFeeds.clear();
86 39 : _sortedFeedURLs.clear();
87 39 : machine->start(CHECK_FEED);
88 :
89 39 : QUrl url = WebUtilities::urlFixup(sURL);
90 :
91 : // Make sure the location isn't a "relative" (and therefore severely invalid) path.
92 39 : if (url.isRelative() || url.scheme().isEmpty()) {
93 : // Try adjusting the scheme.
94 0 : if (url.scheme() == "") {
95 0 : url.setScheme("http");
96 : }
97 :
98 : //qCDebug(logFeedDiscovery) << "Location is adjusted to: " << location;
99 :
100 : // Final check! If it's not valid, we'll set an error and bail.
101 0 : if (url.isRelative()) {
102 0 : reportError(Error::InvalidURL, "Invalid URL");
103 :
104 0 : return;
105 : }
106 : }
107 :
108 : // Okay, we have a potential URL! Let's check it.
109 39 : _feedURL = url;
110 39 : machine->setState(TRY_FEED);
111 39 : timeoutTimer.start();
112 39 : }
113 :
// Stage 1: treat the user-supplied URL as a feed and try to parse it directly.
// Result arrives asynchronously in onFirstParseDone().
void FeedDiscovery::onTryFeed()
{
    parserFirstTry->parse(_feedURL);
}
118 :
// Terminal success state: a valid feed was discovered. Stops the overall
// timeout and notifies the caller; results are read back via accessors.
void FeedDiscovery::onFeedFound()
{
    timeoutTimer.stop();
    // Invariants: success implies no error was recorded and a feed URL is set.
    Q_ASSERT(_error == Error::None);
    Q_ASSERT(!_feedURL.isEmpty());

    emit done(this);
}
127 :
// Stage 2: the URL wasn't a feed itself — fetch it as a web page so we can
// scan its HTML for advertised feeds. Result arrives in onPageGrabberReady().
void FeedDiscovery::onWebGrabber()
{
    pageGrabber->load(_feedURL);
}
132 :
// Terminal failure state (reached via reportError from any stage). Stops the
// timeout and notifies the caller; error details are in error()/errorString().
void FeedDiscovery::onError()
{
    timeoutTimer.stop();
    // Invariants: failure implies an error code and message were recorded.
    Q_ASSERT(_error != Error::None);
    Q_ASSERT(!_errorString.isEmpty());

    emit done(this);
}
141 :
// Fired by timeoutTimer (30s, started in checkFeed) if discovery hasn't
// reached a terminal state in time; aborts the run with a timeout error.
void FeedDiscovery::onTimeout()
{
    reportError(Error::Timeout, "Feed discovery timed out");
}
146 :
147 38 : void FeedDiscovery::onFirstParseDone()
148 : {
149 38 : FeedFetchResult res = parserFirstTry->getResult();
150 38 : switch (res) {
151 18 : case FeedFetchResult::OK:
152 : {
153 : // User directly entered a feed URL! Add it to discovered feeds
154 18 : _feedURL = parserFirstTry->getURL();
155 18 : auto parsedFeed = parserFirstTry->getFeed();
156 :
157 : // Reject empty feeds - a feed that parses OK but has no items is useless.
158 18 : if (!parsedFeed || parsedFeed->items.isEmpty()) {
159 2 : qCDebug(logFeedDiscovery) << "Feed parsed OK but has no items, trying web grabber";
160 1 : machine->setState(WEB_GRABBER);
161 1 : break;
162 : }
163 :
164 : // Add to discovered feeds list
165 17 : DiscoveredFeed discovered;
166 17 : discovered.url = _feedURL;
167 17 : discovered.feed = parsedFeed;
168 17 : discovered.title = parsedFeed->title.isEmpty() ? _feedURL.toString() : parsedFeed->title;
169 17 : discovered.validated = true;
170 17 : _discoveredFeeds.clear();
171 17 : _discoveredFeeds.append(discovered);
172 :
173 17 : machine->setState(FEED_FOUND);
174 17 : break;
175 18 : }
176 :
177 20 : case FeedFetchResult::NetworkError:
178 : case FeedFetchResult::FileError:
179 : case FeedFetchResult::EmptyDocument:
180 : case FeedFetchResult::ParseError:
181 : // Not a feed, probably HTML. Continue to the web grabber stage.
182 20 : machine->setState(WEB_GRABBER);
183 20 : break;
184 :
185 0 : case FeedFetchResult::InProgress:
186 : default:
187 0 : qCCritical(logFeedDiscovery) << "Unexpected parser result in onFirstParseDone";
188 0 : Q_UNREACHABLE();
189 : // Treat as error and continue to web grabber
190 : machine->setState(WEB_GRABBER);
191 : break;
192 : }
193 38 : }
194 :
195 21 : void FeedDiscovery::onPageGrabberReady(WebPageGrabber* grabber, QString* document)
196 : {
197 : Q_UNUSED(grabber);
198 :
199 : // If we didn't get a document, try common paths before giving up.
200 21 : if (!document || document->isEmpty()) {
201 14 : qCDebug(logFeedDiscovery) << "No page found, trying common paths";
202 7 : machine->setState(TRY_COMMON_PATHS);
203 14 : return;
204 : }
205 :
206 : // Parse feed URLs from the HTML document
207 14 : QList<QString> feedURLs = parseFeedsFromXHTML(*document);
208 28 : qCDebug(logFeedDiscovery) << "Parsed" << feedURLs.count() << "feed URLs from HTML";
209 :
210 14 : if (feedURLs.isEmpty()) {
211 14 : qCDebug(logFeedDiscovery) << "No feeds found in HTML, trying common paths";
212 7 : _pageXHTML = *document;
213 7 : machine->setState(TRY_COMMON_PATHS);
214 7 : return;
215 : }
216 :
217 14 : qCDebug(logFeedDiscovery) << "Total feed URLs found:" << feedURLs.count();
218 :
219 : // Sort by path length (longer paths first = more specific)
220 7 : QList<QString> feedURLStrings = feedURLs;
221 7 : std::sort(feedURLStrings.begin(), feedURLStrings.end(),
222 3 : [](const QString& a, const QString& b) {
223 3 : QUrl urlA(a);
224 3 : QUrl urlB(b);
225 6 : return urlA.path().length() > urlB.path().length();
226 3 : });
227 :
228 : // Convert to QUrl list and store for validation
229 7 : _sortedFeedURLs.clear();
230 16 : for (const QString& urlString : feedURLStrings) {
231 9 : QUrl feedUrl(urlString);
232 :
233 : // Fix relative URLs.
234 9 : if (feedUrl.isRelative()) {
235 2 : feedUrl = _feedURL.resolved(feedUrl);
236 : }
237 9 : _sortedFeedURLs.append(feedUrl);
238 9 : }
239 :
240 : // Trigger bulk feed validation
241 7 : machine->setState(VALIDATE_FEEDS);
242 14 : }
243 :
244 55 : QList<QString> FeedDiscovery::parseFeedsFromXHTML(const QString& document)
245 : {
246 55 : QList<QString> feedsFound;
247 :
248 : // Examples of what we're looking for:
249 : // <link rel="alternate" href="http://www.fark.com/fark.rss" type="application/rss+xml" title="FARK.com Fark RSS Feed">
250 : // <link rel="alternate" type="application/rss+xml" title="MrEricSir.com RSS Feed" href="http://www.mrericsir.com/blog/feed/" />
251 : // <link rel="alternate" type="application/atom+xml" title="MrEricSir.com Atom Feed" href="http://www.mrericsir.com/blog/feed/atom/" />
252 55 : const QString S_REL = "rel";
253 55 : const QString S_HREF = "href";
254 55 : const QString S_TYPE = "type";
255 55 : const QString S_TITLE = "title";
256 55 : const QString S_WORDPRESS_COMMENTS_URL_SUFFIX = "/comments/feed/";
257 :
258 55 : QXmlStreamReader xml;
259 55 : xml.addData(document);
260 :
261 10790 : while (!xml.atEnd()) {
262 : // Grab the next thingie.
263 10734 : xml.readNext();
264 :
265 10734 : if (xml.isStartElement()) {
266 2790 : QString tagName = xml.name().toString().toLower();
267 2790 : if (tagName == "body") {
268 : // We're done with the header, so bail.
269 54 : return feedsFound;
270 : }
271 :
272 2736 : if (tagName == "link") {
273 887 : QXmlStreamAttributes attributes = xml.attributes();
274 :
275 : // Is this a feed?
276 2629 : if (attributes.hasAttribute(S_REL) && attributes.hasAttribute(S_HREF) &&
277 2751 : attributes.value("", S_REL).toString().toLower() == "alternate" &&
278 2821 : attributes.hasAttribute(S_TYPE) &&
279 1024 : (attributes.value("", S_TYPE).toString().toLower() == "application/rss+xml" ||
280 928 : attributes.value("", S_TYPE).toString().toLower() == "application/atom+xml" ||
281 909 : attributes.value("", S_TYPE).toString().toLower() == "application/feed+json")) {
282 : // Run some checks and then add our feed if it seems reasonable to do so.
283 102 : QString url = attributes.value("", S_HREF).toString();
284 :
285 : // Avoid comments feeds as they tend to get added by accident.
286 51 : if (url.endsWith(S_WORDPRESS_COMMENTS_URL_SUFFIX)) {
287 8 : continue;
288 : }
289 :
290 : // Strip trailing slash from feed paths. Some servers (e.g. cbsnews.com)
291 : // return 404 for trailing-slash feed URLs but 200 without.
292 43 : if (url.endsWith("/") && !url.endsWith("://")) {
293 18 : url.chop(1);
294 : }
295 :
296 43 : feedsFound << url;
297 51 : }
298 887 : }
299 2790 : }
300 : }
301 :
302 1 : return feedsFound;
303 55 : }
304 :
// Stage 3: bulk-validate the feed URLs harvested from the page's HTML
// (populated and sorted by onPageGrabberReady). Results arrive
// asynchronously in onFeedParserReady().
void FeedDiscovery::onValidateFeeds()
{
    // Use the sorted feed URLs from onPageGrabberReady
    if (_sortedFeedURLs.isEmpty()) {
        // Defensive: this state should only be entered with candidates queued.
        reportError(Error::NoFeedsFound, "No feeds to validate");
        return;
    }

    // Bulk parse all feed URLs
    feedParser->parse(_sortedFeedURLs);
}
316 :
// Handles bulk-parse results. This slot serves two states: validating
// HTML-discovered feeds (stage 3) and probing common paths (stage 4);
// _probingCommonPaths tells them apart so failures escalate correctly
// (HTML validation failure -> common paths; common-path failure -> sitemap).
void FeedDiscovery::onFeedParserReady()
{
    // Process all parsed feeds
    _discoveredFeeds.clear();

    QMap<QUrl, FeedFetchResult> results = feedParser->getResults();
    for (auto it = results.constBegin(); it != results.constEnd(); ++it) {
        QUrl feedURL = it.key();
        FeedFetchResult result = it.value();

        // Only include successfully parsed feeds that have items.
        if (result == FeedFetchResult::OK) {
            auto feed = feedParser->getFeed(feedURL);
            if (feed && !feed->items.isEmpty()) {
                DiscoveredFeed discovered;
                discovered.url = feedURL;
                discovered.feed = feed;
                // Fall back to the URL when the feed declares no title.
                discovered.title = feed->title.isEmpty() ? feedURL.toString() : feed->title;
                discovered.validated = true;
                _discoveredFeeds.append(discovered);
            }
        }
    }

    // Check if we found any valid feeds.
    if (_discoveredFeeds.isEmpty()) {
        if (_probingCommonPaths) {
            // Common paths didn't turn up anything.
            qCDebug(logFeedDiscovery) << "No valid feeds at common paths, trying sitemap";
            _probingCommonPaths = false;
            machine->setState(TRY_NEWS_SITEMAP);
        } else {
            // Validation of HTML-discovered feeds failed.
            qCDebug(logFeedDiscovery) << "No valid feeds found, trying common paths";
            machine->setState(TRY_COMMON_PATHS);
        }
        return;
    }

    _probingCommonPaths = false;

    // Set the first valid feed as the primary one (for backward compatibility)
    _feedURL = _discoveredFeeds.first().url;

    // Emit done signal
    machine->setState(FEED_FOUND);
}
364 :
365 17 : QStringList FeedDiscovery::commonFeedPaths()
366 : {
367 : return {
368 : "/feed",
369 : "/rss",
370 : "/feed.xml",
371 : "/feed.json",
372 : "/rss.xml",
373 : "/rss2.0.xml",
374 : "/atom.xml",
375 : "/index.xml",
376 : "/blog/feed"
377 170 : };
378 17 : }
379 :
380 17 : void FeedDiscovery::onTryCommonPaths()
381 : {
382 17 : QUrl rootUrl;
383 17 : rootUrl.setScheme(_feedURL.scheme());
384 17 : rootUrl.setHost(_feedURL.host());
385 17 : if (_feedURL.port() != -1) {
386 0 : rootUrl.setPort(_feedURL.port());
387 : }
388 :
389 17 : QList<QUrl> probeURLs;
390 170 : for (const QString& path : commonFeedPaths()) {
391 153 : QUrl probeUrl = rootUrl;
392 153 : probeUrl.setPath(path);
393 153 : probeURLs.append(probeUrl);
394 170 : }
395 :
396 34 : qCDebug(logFeedDiscovery) << "Probing" << probeURLs.count() << "common feed paths";
397 17 : _probingCommonPaths = true;
398 17 : feedParser->parse(probeURLs);
399 17 : }
400 :
// Stage 5 (last resort): synthesize a feed from the site's news sitemap.
// Uses the page title captured earlier (if any) as the synthesized feed's
// title, falling back to the host name. Result arrives in onNewsSitemapDone().
void FeedDiscovery::onTryNewsSitemap()
{
    // Extract site title from the already-fetched XHTML (if available).
    // The page content may be a redirect or error page, so validate the title.
    QString siteTitle;
    if (!_pageXHTML.isEmpty()) {
        PageMetadata meta = PageMetadataExtractor::extract(_pageXHTML);
        // Reject titles that look like HTTP status messages.
        // NOTE: substring matching is intentionally broad; a legitimate title
        // containing e.g. "Error" is also rejected and falls back to the host.
        if (!meta.title.isEmpty()
                && !meta.title.contains("Moved", Qt::CaseInsensitive)
                && !meta.title.contains("Forbidden", Qt::CaseInsensitive)
                && !meta.title.contains("Not Found", Qt::CaseInsensitive)
                && !meta.title.contains("Error", Qt::CaseInsensitive)) {
            siteTitle = meta.title;
        }
    }
    if (siteTitle.isEmpty()) {
        siteTitle = _feedURL.host();
    }

    qCDebug(logFeedDiscovery) << "FeedDiscovery: trying sitemap for" << _feedURL
                              << "with title" << siteTitle;

    // Created lazily since most discoveries never reach this stage.
    if (!newsSitemapSynthesizer) {
        newsSitemapSynthesizer = new NewsSitemapSynthesizer(this);
    }
    // UniqueConnection: this slot can be re-entered across runs; avoid
    // connecting the same signal/slot pair more than once.
    connect(newsSitemapSynthesizer, &NewsSitemapSynthesizer::done,
            this, &FeedDiscovery::onNewsSitemapDone, Qt::UniqueConnection);
    newsSitemapSynthesizer->synthesize(_feedURL, siteTitle);
}
431 :
432 14 : void FeedDiscovery::onNewsSitemapDone()
433 : {
434 14 : if (newsSitemapSynthesizer->hasError()) {
435 12 : reportError(Error::NoFeedsFound, newsSitemapSynthesizer->errorString());
436 12 : return;
437 : }
438 :
439 2 : auto synthFeed = newsSitemapSynthesizer->result();
440 2 : if (!synthFeed || synthFeed->items.isEmpty()) {
441 0 : reportError(Error::NoFeedsFound, "No feed found");
442 0 : return;
443 : }
444 :
445 : // Set the primary feed result.
446 2 : _feedURL = synthFeed->url;
447 :
448 : // Add to discovered feeds list.
449 2 : DiscoveredFeed discovered;
450 2 : discovered.url = synthFeed->url;
451 2 : discovered.feed = synthFeed;
452 2 : discovered.title = synthFeed->title;
453 2 : discovered.validated = true;
454 2 : _discoveredFeeds.clear();
455 2 : _discoveredFeeds.append(discovered);
456 :
457 2 : machine->setState(FEED_FOUND);
458 2 : }
459 :
// Records the error code/message and transitions to the terminal FEED_ERROR
// state, which in turn stops the timeout and emits done() (see onError()).
void FeedDiscovery::reportError(Error error, const QString& errorString)
{
    _error = error;
    _errorString = errorString;

    machine->setState(FEED_ERROR);
}
|