Line data Source code
1 : #include "FeedDiscovery.h"
2 : #include "FangLogging.h"
3 : #include "PageMetadataExtractor.h"
4 : #include <QXmlStreamReader>
5 : #include <QSet>
6 : #include <algorithm>
7 : #include "NetworkUtilities.h"
8 : #include "ErrorHandling.h"
9 : #include "../parser/NewsParser.h"
10 : #include "../parser/BatchNewsParser.h"
11 : #include "WebPageGrabber.h"
12 :
13 70 : FeedDiscovery::FeedDiscovery(QObject *parent,
14 : ParserInterface* firstParser,
15 : ParserInterface* secondParser,
16 : WebPageGrabber* pageGrabber,
17 : BatchNewsParser* feedParser,
18 70 : GoogleNewsSitemapSynthesizer* sitemapSynthesizer) :
19 : FangObject(parent),
20 70 : machine(),
21 70 : _error(false),
22 70 : _errorString(""),
23 70 : _feedResult(nullptr),
24 70 : _probingCommonPaths(false),
25 140 : newsSitemapSynthesizer(sitemapSynthesizer)
26 : {
27 : // Handle secondParser: no longer used, but we need to clean it up if provided
28 70 : if (secondParser) {
29 30 : if (!secondParser->parent()) {
30 30 : secondParser->setParent(this); // Take ownership so it gets cleaned up
31 : }
32 : }
33 :
34 : // Create default implementations if not provided (with this as parent for auto-cleanup)
35 70 : parserFirstTry = firstParser ? firstParser : new NewsParser(this);
36 70 : this->pageGrabber = pageGrabber ? pageGrabber : new WebPageGrabber(this);
37 70 : this->feedParser = feedParser ? feedParser : new BatchNewsParser(this);
38 :
39 : // Take ownership of injected dependencies by setting parent
40 70 : if (parserFirstTry && !parserFirstTry->parent()) {
41 30 : parserFirstTry->setParent(this);
42 : }
43 70 : if (this->pageGrabber && !this->pageGrabber->parent()) {
44 30 : this->pageGrabber->setParent(this);
45 : }
46 70 : if (this->feedParser && !this->feedParser->parent()) {
47 30 : this->feedParser->setParent(this);
48 : }
49 70 : if (newsSitemapSynthesizer && !newsSitemapSynthesizer->parent()) {
50 12 : newsSitemapSynthesizer->setParent(this);
51 : }
52 :
53 : // Set up our state machine.
54 109 : machine.addStateChange(CHECK_FEED, TRY_FEED, [this]() { onTryFeed(); });
55 87 : machine.addStateChange(TRY_FEED, FEED_FOUND, [this]() { onFeedFound(); });
56 91 : machine.addStateChange(TRY_FEED, WEB_GRABBER, [this]() { onWebGrabber(); });
57 77 : machine.addStateChange(WEB_GRABBER, VALIDATE_FEEDS, [this]() { onValidateFeeds(); });
58 74 : machine.addStateChange(VALIDATE_FEEDS, FEED_FOUND, [this]() { onFeedFound(); });
59 84 : machine.addStateChange(WEB_GRABBER, TRY_COMMON_PATHS, [this]() { onTryCommonPaths(); });
60 73 : machine.addStateChange(VALIDATE_FEEDS, TRY_COMMON_PATHS, [this]() { onTryCommonPaths(); });
61 73 : machine.addStateChange(TRY_COMMON_PATHS, FEED_FOUND, [this]() { onFeedFound(); });
62 84 : machine.addStateChange(TRY_COMMON_PATHS, TRY_GOOGLE_NEWS_SITEMAP, [this]() { onTryGoogleNewsSitemap(); });
63 72 : machine.addStateChange(TRY_GOOGLE_NEWS_SITEMAP, FEED_FOUND, [this]() { onFeedFound(); });
64 :
65 82 : machine.addStateChange(-1, FEED_ERROR, [this]() { onError(); }); // All errors.
66 :
67 : // Overall discovery timeout.
68 70 : timeoutTimer.setSingleShot(true);
69 70 : timeoutTimer.setInterval(30000);
70 70 : connect(&timeoutTimer, &QTimer::timeout, this, &FeedDiscovery::onTimeout);
71 :
72 : // Parser signals.
73 70 : connect(parserFirstTry, &ParserInterface::done, this, &FeedDiscovery::onFirstParseDone);
74 :
75 : // Web page grabber signals.
76 70 : connect(this->pageGrabber, &WebPageGrabber::ready, this, &FeedDiscovery::onPageGrabberReady);
77 70 : connect(this->feedParser, &BatchNewsParser::ready, this, &FeedDiscovery::onFeedParserReady);
78 70 : }
79 :
80 70 : FeedDiscovery::~FeedDiscovery()
81 : {
82 : // Qt parent/child hierarchy handles cleanup automatically
83 70 : }
84 :
85 39 : void FeedDiscovery::checkFeed(QString sURL)
86 : {
87 : // Reset state
88 39 : _error = false;
89 39 : _errorString = "";
90 39 : _probingCommonPaths = false;
91 39 : _discoveredFeeds.clear();
92 39 : _sortedFeedURLs.clear();
93 39 : machine.start(CHECK_FEED);
94 :
95 39 : QUrl url = NetworkUtilities::urlFixup(sURL);
96 :
97 : // Make sure the location isn't a "relative" (and therefore severely invalid) path.
98 39 : if (url.isRelative() || url.scheme().isEmpty()) {
99 : // Try adjusting the scheme.
100 0 : if (url.scheme() == "") {
101 0 : url.setScheme("http");
102 : }
103 :
104 : //qCDebug(logUtility) << "Location is adjusted to: " << location;
105 :
106 : // Final check! If it's not valid, we'll set an error and bail.
107 0 : if (url.isRelative()) {
108 0 : reportError("Invalid URL");
109 :
110 0 : return;
111 : }
112 : }
113 :
114 : // Okay, we have a potential URL! Let's check it.
115 39 : _feedURL = url;
116 39 : machine.setState(TRY_FEED);
117 39 : timeoutTimer.start();
118 39 : }
119 :
120 39 : void FeedDiscovery::onTryFeed()
121 : {
122 39 : parserFirstTry->parse(_feedURL);
123 39 : }
124 :
125 26 : void FeedDiscovery::onFeedFound()
126 : {
127 26 : timeoutTimer.stop();
128 26 : FANG_CHECK(!_error, "FeedDiscovery::onFeedFound called with _error set");
129 26 : FANG_CHECK(!_feedURL.isEmpty(), "FeedDiscovery::onFeedFound called with empty _feedURL");
130 :
131 26 : emit done(this);
132 26 : }
133 :
134 21 : void FeedDiscovery::onWebGrabber()
135 : {
136 21 : pageGrabber->load(_feedURL);
137 21 : }
138 :
139 12 : void FeedDiscovery::onError()
140 : {
141 12 : timeoutTimer.stop();
142 12 : FANG_CHECK(_error, "FeedDiscovery::onError called without _error set");
143 12 : FANG_CHECK(!_errorString.isEmpty(), "FeedDiscovery::onError called with empty _errorString");
144 :
145 12 : emit done(this);
146 12 : }
147 :
148 0 : void FeedDiscovery::onTimeout()
149 : {
150 0 : reportError("Feed discovery timed out");
151 0 : }
152 :
153 38 : void FeedDiscovery::onFirstParseDone()
154 : {
155 38 : int res = parserFirstTry->getResult();
156 38 : switch (res) {
157 18 : case ParserInterface::OK:
158 : {
159 : // User directly entered a feed URL! Add it to discovered feeds
160 18 : _feedURL = parserFirstTry->getURL();
161 18 : _feedResult = parserFirstTry->getFeed();
162 :
163 : // Reject empty feeds - a feed that parses OK but has no items is useless.
164 18 : if (!_feedResult || _feedResult->items.isEmpty()) {
165 2 : qCDebug(logUtility) << "Feed parsed OK but has no items, trying web grabber";
166 1 : machine.setState(WEB_GRABBER);
167 1 : break;
168 : }
169 :
170 : // Add to discovered feeds list
171 17 : DiscoveredFeed discovered;
172 17 : discovered.url = _feedURL;
173 17 : discovered.feed = _feedResult;
174 17 : discovered.title = _feedResult->title.isEmpty() ? _feedURL.toString() : _feedResult->title;
175 17 : discovered.validated = true;
176 17 : _discoveredFeeds.clear();
177 17 : _discoveredFeeds.append(discovered);
178 :
179 17 : machine.setState(FEED_FOUND);
180 17 : break;
181 17 : }
182 :
183 20 : case ParserInterface::NETWORK_ERROR:
184 : case ParserInterface::FILE_ERROR:
185 : case ParserInterface::EMPTY_DOCUMENT:
186 : case ParserInterface::PARSE_ERROR:
187 : // Not a feed, probably HTML. Continue to the web grabber stage.
188 20 : machine.setState(WEB_GRABBER);
189 20 : break;
190 :
191 0 : case ParserInterface::IN_PROGRESS:
192 : default:
193 0 : FANG_UNREACHABLE("Unexpected parser result in onFirstParseDone");
194 : // Treat as error and continue to web grabber
195 : machine.setState(WEB_GRABBER);
196 : break;
197 : }
198 38 : }
199 :
200 21 : void FeedDiscovery::onPageGrabberReady(WebPageGrabber* grabber, QString* document)
201 : {
202 : Q_UNUSED(grabber);
203 :
204 : // If we didn't get a document, try common paths before giving up.
205 21 : if (!document || document->isEmpty()) {
206 14 : qCDebug(logUtility) << "No page found, trying common paths";
207 7 : machine.setState(TRY_COMMON_PATHS);
208 14 : return;
209 : }
210 :
211 : // Parse feed URLs from the HTML document
212 14 : QList<QString> feedURLs = parseFeedsFromXHTML(*document);
213 28 : qCDebug(logUtility) << "Parsed" << feedURLs.count() << "feed URLs from HTML";
214 :
215 14 : if (feedURLs.isEmpty()) {
216 14 : qCDebug(logUtility) << "No feeds found in HTML, trying common paths";
217 7 : _pageXHTML = *document;
218 7 : machine.setState(TRY_COMMON_PATHS);
219 7 : return;
220 : }
221 :
222 14 : qCDebug(logUtility) << "Total feed URLs found:" << feedURLs.count();
223 :
224 : // Sort by path length (longer paths first = more specific)
225 7 : QList<QString> feedURLStrings = feedURLs;
226 7 : std::sort(feedURLStrings.begin(), feedURLStrings.end(),
227 3 : [](const QString& a, const QString& b) {
228 3 : QUrl urlA(a);
229 3 : QUrl urlB(b);
230 6 : return urlA.path().length() > urlB.path().length();
231 3 : });
232 :
233 : // Convert to QUrl list and store for validation
234 7 : _sortedFeedURLs.clear();
235 16 : for (const QString& urlString : feedURLStrings) {
236 9 : QUrl feedUrl(urlString);
237 :
238 : // Fix relative URLs.
239 9 : if (feedUrl.isRelative()) {
240 2 : feedUrl = _feedURL.resolved(feedUrl);
241 : }
242 9 : _sortedFeedURLs.append(feedUrl);
243 9 : }
244 :
245 : // Trigger bulk feed validation
246 7 : machine.setState(VALIDATE_FEEDS);
247 14 : }
248 :
249 54 : QList<QString> FeedDiscovery::parseFeedsFromXHTML(const QString& document)
250 : {
251 54 : QList<QString> feedsFound;
252 :
253 : // Examples of what we're looking for:
254 : // <link rel="alternate" href="http://www.fark.com/fark.rss" type="application/rss+xml" title="FARK.com Fark RSS Feed">
255 : // <link rel="alternate" type="application/rss+xml" title="MrEricSir.com RSS Feed" href="http://www.mrericsir.com/blog/feed/" />
256 : // <link rel="alternate" type="application/atom+xml" title="MrEricSir.com Atom Feed" href="http://www.mrericsir.com/blog/feed/atom/" />
257 54 : const QString S_REL = "rel";
258 54 : const QString S_HREF = "href";
259 54 : const QString S_TYPE = "type";
260 54 : const QString S_TITLE = "title";
261 54 : const QString S_WORDPRESS_COMMENTS_URL_SUFFIX = "/comments/feed/";
262 :
263 54 : QXmlStreamReader xml;
264 54 : xml.addData(document);
265 :
266 10767 : while (!xml.atEnd()) {
267 : // Grab the next thingie.
268 10712 : xml.readNext();
269 :
270 10712 : if (xml.isStartElement()) {
271 2783 : QString tagName = xml.name().toString().toLower();
272 2783 : if (tagName == "body") {
273 : // We're done with the header, so bail.
274 53 : return feedsFound;
275 : }
276 :
277 2730 : if (tagName == "link") {
278 885 : QXmlStreamAttributes attributes = xml.attributes();
279 :
280 : // Is this a feed?
281 2623 : if (attributes.hasAttribute(S_REL) && attributes.hasAttribute(S_HREF) &&
282 2743 : attributes.value("", S_REL).toString().toLower() == "alternate" &&
283 2812 : attributes.hasAttribute(S_TYPE) &&
284 1017 : (attributes.value("", S_TYPE).toString().toLower() == "application/rss+xml" ||
285 913 : attributes.value("", S_TYPE).toString().toLower() == "application/atom+xml")) {
286 : // Run some checks and then add our feed if it seems reasonable to do so.
287 98 : QString url = attributes.value("", S_HREF).toString();
288 :
289 : // Avoid comments feeds as they tend to get added by accident.
290 49 : if (url.endsWith(S_WORDPRESS_COMMENTS_URL_SUFFIX)) {
291 8 : continue;
292 : }
293 :
294 : // Strip trailing slash from feed paths. Some servers (e.g. cbsnews.com)
295 : // return 404 for trailing-slash feed URLs but 200 without.
296 41 : if (url.endsWith("/") && !url.endsWith("://")) {
297 18 : url.chop(1);
298 : }
299 :
300 41 : feedsFound << url;
301 49 : }
302 885 : }
303 2783 : }
304 : }
305 :
306 1 : return feedsFound;
307 54 : }
308 :
309 7 : void FeedDiscovery::onValidateFeeds()
310 : {
311 : // Use the sorted feed URLs from onPageGrabberReady
312 7 : if (_sortedFeedURLs.isEmpty()) {
313 0 : reportError("No feeds to validate");
314 0 : return;
315 : }
316 :
317 : // Bulk parse all feed URLs
318 7 : feedParser->parse(_sortedFeedURLs);
319 : }
320 :
321 24 : void FeedDiscovery::onFeedParserReady()
322 : {
323 : // Process all parsed feeds
324 24 : _discoveredFeeds.clear();
325 :
326 24 : QMap<QUrl, ParserInterface::ParseResult> results = feedParser->getResults();
327 169 : for (auto it = results.constBegin(); it != results.constEnd(); ++it) {
328 145 : QUrl feedURL = it.key();
329 145 : ParserInterface::ParseResult result = it.value();
330 :
331 : // Only include successfully parsed feeds that have items.
332 145 : if (result == ParserInterface::OK) {
333 10 : RawFeed* feed = feedParser->getFeed(feedURL);
334 10 : if (feed && !feed->items.isEmpty()) {
335 8 : DiscoveredFeed discovered;
336 8 : discovered.url = feedURL;
337 8 : discovered.feed = feed; // Feed is owned by feedParser
338 8 : discovered.title = feed->title.isEmpty() ? feedURL.toString() : feed->title;
339 8 : discovered.content = ""; // Not storing raw content anymore
340 8 : discovered.validated = true;
341 8 : _discoveredFeeds.append(discovered);
342 8 : }
343 : }
344 145 : }
345 :
346 : // Check if we found any valid feeds.
347 24 : if (_discoveredFeeds.isEmpty()) {
348 17 : if (_probingCommonPaths) {
349 : // Common paths didn't turn up anything.
350 28 : qCDebug(logUtility) << "No valid feeds at common paths, trying sitemap";
351 14 : _probingCommonPaths = false;
352 14 : machine.setState(TRY_GOOGLE_NEWS_SITEMAP);
353 : } else {
354 : // Validation of HTML-discovered feeds failed.
355 6 : qCDebug(logUtility) << "No valid feeds found, trying common paths";
356 3 : machine.setState(TRY_COMMON_PATHS);
357 : }
358 17 : return;
359 : }
360 :
361 7 : _probingCommonPaths = false;
362 :
363 : // Set the first valid feed as the primary one (for backward compatibility)
364 7 : _feedURL = _discoveredFeeds.first().url;
365 7 : _feedResult = _discoveredFeeds.first().feed;
366 :
367 : // Emit done signal
368 7 : machine.setState(FEED_FOUND);
369 24 : }
370 :
371 17 : QStringList FeedDiscovery::commonFeedPaths()
372 : {
373 : return {
374 : "/feed",
375 : "/rss",
376 : "/feed.xml",
377 : "/rss.xml",
378 : "/rss2.0.xml",
379 : "/atom.xml",
380 : "/index.xml",
381 : "/blog/feed"
382 153 : };
383 17 : }
384 :
385 17 : void FeedDiscovery::onTryCommonPaths()
386 : {
387 17 : QUrl rootUrl;
388 17 : rootUrl.setScheme(_feedURL.scheme());
389 17 : rootUrl.setHost(_feedURL.host());
390 17 : if (_feedURL.port() != -1) {
391 0 : rootUrl.setPort(_feedURL.port());
392 : }
393 :
394 17 : QList<QUrl> probeURLs;
395 153 : for (const QString& path : commonFeedPaths()) {
396 136 : QUrl probeUrl = rootUrl;
397 136 : probeUrl.setPath(path);
398 136 : probeURLs.append(probeUrl);
399 153 : }
400 :
401 34 : qCDebug(logUtility) << "Probing" << probeURLs.count() << "common feed paths";
402 17 : _probingCommonPaths = true;
403 17 : feedParser->parse(probeURLs);
404 17 : }
405 :
406 14 : void FeedDiscovery::onTryGoogleNewsSitemap()
407 : {
408 : // Extract site title from the already-fetched XHTML (if available).
409 : // The page content may be a redirect or error page, so validate the title.
410 14 : QString siteTitle;
411 14 : if (!_pageXHTML.isEmpty()) {
412 5 : PageMetadata meta = PageMetadataExtractor::extract(_pageXHTML);
413 : // Reject titles that look like HTTP status messages.
414 15 : if (!meta.title.isEmpty()
415 10 : && !meta.title.contains("Moved", Qt::CaseInsensitive)
416 10 : && !meta.title.contains("Forbidden", Qt::CaseInsensitive)
417 10 : && !meta.title.contains("Not Found", Qt::CaseInsensitive)
418 10 : && !meta.title.contains("Error", Qt::CaseInsensitive)) {
419 5 : siteTitle = meta.title;
420 : }
421 5 : }
422 14 : if (siteTitle.isEmpty()) {
423 9 : siteTitle = _feedURL.host();
424 : }
425 :
426 28 : qCDebug(logUtility) << "FeedDiscovery: trying sitemap for" << _feedURL
427 14 : << "with title" << siteTitle;
428 :
429 14 : if (!newsSitemapSynthesizer) {
430 2 : newsSitemapSynthesizer = new GoogleNewsSitemapSynthesizer(this);
431 : }
432 14 : connect(newsSitemapSynthesizer, &GoogleNewsSitemapSynthesizer::done,
433 14 : this, &FeedDiscovery::onNewsSitemapDone, Qt::UniqueConnection);
434 14 : newsSitemapSynthesizer->synthesize(_feedURL, siteTitle);
435 14 : }
436 :
437 14 : void FeedDiscovery::onNewsSitemapDone()
438 : {
439 14 : if (newsSitemapSynthesizer->hasError()) {
440 12 : reportError(newsSitemapSynthesizer->errorString());
441 12 : return;
442 : }
443 :
444 2 : RawFeed* synthFeed = newsSitemapSynthesizer->result();
445 2 : if (!synthFeed || synthFeed->items.isEmpty()) {
446 0 : reportError("No feed found");
447 0 : return;
448 : }
449 :
450 : // Set the primary feed result.
451 2 : _feedURL = synthFeed->url;
452 2 : _feedResult = synthFeed;
453 :
454 : // Add to discovered feeds list.
455 2 : DiscoveredFeed discovered;
456 2 : discovered.url = synthFeed->url;
457 2 : discovered.feed = synthFeed;
458 2 : discovered.title = synthFeed->title;
459 2 : discovered.validated = true;
460 2 : _discoveredFeeds.clear();
461 2 : _discoveredFeeds.append(discovered);
462 :
463 2 : machine.setState(FEED_FOUND);
464 2 : }
465 :
466 12 : void FeedDiscovery::reportError(QString errorString)
467 : {
468 12 : _error = true;
469 12 : _errorString = errorString;
470 :
471 12 : machine.setState(FEED_ERROR);
472 12 : }
|