Line data Source code
1 : #include "FeedDiscovery.h"
2 : #include <QXmlStreamReader>
3 : #include <QSet>
4 : #include <algorithm>
5 : #include <QDebug>
6 : #include "NetworkUtilities.h"
7 : #include "ErrorHandling.h"
8 : #include "../parser/NewsParser.h"
9 : #include "../parser/BatchNewsParser.h"
10 : #include "WebPageGrabber.h"
11 :
12 44 : FeedDiscovery::FeedDiscovery(QObject *parent,
13 : ParserInterface* firstParser,
14 : ParserInterface* secondParser,
15 : WebPageGrabber* pageGrabber,
16 44 : BatchNewsParser* feedParser) :
17 : FangObject(parent),
18 44 : machine(),
19 44 : _error(false),
20 44 : _errorString(""),
21 88 : _feedResult(nullptr)
22 : {
23 : // Handle secondParser: no longer used, but we need to clean it up if provided
24 44 : if (secondParser) {
25 17 : if (!secondParser->parent()) {
26 17 : secondParser->setParent(this); // Take ownership so it gets cleaned up
27 : }
28 : }
29 :
30 : // Create default implementations if not provided (with this as parent for auto-cleanup)
31 44 : parserFirstTry = firstParser ? firstParser : new NewsParser(this);
32 44 : this->pageGrabber = pageGrabber ? pageGrabber : new WebPageGrabber(this);
33 44 : this->feedParser = feedParser ? feedParser : new BatchNewsParser(this);
34 :
35 : // Take ownership of injected dependencies by setting parent
36 44 : if (parserFirstTry && !parserFirstTry->parent()) {
37 17 : parserFirstTry->setParent(this);
38 : }
39 44 : if (this->pageGrabber && !this->pageGrabber->parent()) {
40 17 : this->pageGrabber->setParent(this);
41 : }
42 44 : if (this->feedParser && !this->feedParser->parent()) {
43 17 : this->feedParser->setParent(this);
44 : }
45 :
46 : // Set up our state machine.
47 71 : machine.addStateChange(CHECK_FEED, TRY_FEED, [this]() { onTryFeed(); });
48 61 : machine.addStateChange(TRY_FEED, FEED_FOUND, [this]() { onFeedFound(); });
49 53 : machine.addStateChange(TRY_FEED, WEB_GRABBER, [this]() { onWebGrabber(); });
50 46 : machine.addStateChange(WEB_GRABBER, VALIDATE_FEEDS, [this]() { onValidateFeeds(); });
51 45 : machine.addStateChange(VALIDATE_FEEDS, FEED_FOUND, [this]() { onFeedFound(); });
52 :
53 52 : machine.addStateChange(-1, FEED_ERROR, [this]() { onError(); }); // All errors.
54 :
55 : // Parser signals.
56 44 : connect(parserFirstTry, &ParserInterface::done, this, &FeedDiscovery::onFirstParseDone);
57 :
58 : // Web page grabber signals.
59 44 : connect(this->pageGrabber, &WebPageGrabber::ready, this, &FeedDiscovery::onPageGrabberReady);
60 44 : connect(this->feedParser, &BatchNewsParser::ready, this, &FeedDiscovery::onFeedParserReady);
61 44 : }
62 :
63 44 : FeedDiscovery::~FeedDiscovery()
64 : {
65 : // Qt parent/child hierarchy handles cleanup automatically
66 44 : }
67 :
68 27 : void FeedDiscovery::checkFeed(QString sURL)
69 : {
70 : // Reset state
71 27 : _error = false;
72 27 : _errorString = "";
73 27 : _discoveredFeeds.clear();
74 27 : _sortedFeedURLs.clear();
75 27 : machine.start(CHECK_FEED);
76 :
77 27 : QUrl url = NetworkUtilities::urlFixup(sURL);
78 :
79 : // Make sure the location isn't a "relative" (and therefore severely invalid) path.
80 27 : if (url.isRelative() || url.scheme().isEmpty()) {
81 : // Try adjusting the scheme.
82 0 : if (url.scheme() == "") {
83 0 : url.setScheme("http");
84 : }
85 :
86 : //qDebug() << "Location is adjusted to: " << location;
87 :
88 : // Final check! If it's not valid, we'll set an error and bail.
89 0 : if (url.isRelative()) {
90 0 : reportError("Invalid URL");
91 :
92 0 : return;
93 : }
94 : }
95 :
96 : // Okay, we have a potential URL! Let's check it.
97 27 : _feedURL = url;
98 27 : machine.setState(TRY_FEED);
99 27 : }
100 :
101 27 : void FeedDiscovery::onTryFeed()
102 : {
103 27 : parserFirstTry->parse(_feedURL);
104 27 : }
105 :
106 18 : void FeedDiscovery::onFeedFound()
107 : {
108 18 : FANG_CHECK(!_error, "FeedDiscovery::onFeedFound called with _error set");
109 18 : FANG_CHECK(!_feedURL.isEmpty(), "FeedDiscovery::onFeedFound called with empty _feedURL");
110 :
111 18 : emit done(this);
112 18 : }
113 :
114 9 : void FeedDiscovery::onWebGrabber()
115 : {
116 9 : pageGrabber->load(_feedURL);
117 9 : }
118 :
119 8 : void FeedDiscovery::onError()
120 : {
121 8 : FANG_CHECK(_error, "FeedDiscovery::onError called without _error set");
122 8 : FANG_CHECK(!_errorString.isEmpty(), "FeedDiscovery::onError called with empty _errorString");
123 :
124 8 : emit done(this);
125 8 : }
126 :
127 26 : void FeedDiscovery::onFirstParseDone()
128 : {
129 26 : int res = parserFirstTry->getResult();
130 26 : switch (res) {
131 17 : case ParserInterface::OK:
132 : {
133 : // User directly entered a feed URL! Add it to discovered feeds
134 17 : _feedURL = parserFirstTry->getURL();
135 17 : _feedResult = parserFirstTry->getFeed();
136 :
137 : // Add to discovered feeds list
138 17 : DiscoveredFeed discovered;
139 17 : discovered.url = _feedURL;
140 17 : discovered.feed = _feedResult;
141 17 : discovered.title = _feedResult ? _feedResult->title : _feedURL.toString();
142 17 : discovered.validated = true;
143 17 : _discoveredFeeds.clear();
144 17 : _discoveredFeeds.append(discovered);
145 :
146 17 : machine.setState(FEED_FOUND);
147 17 : break;
148 17 : }
149 :
150 9 : case ParserInterface::NETWORK_ERROR:
151 : case ParserInterface::FILE_ERROR:
152 : case ParserInterface::EMPTY_DOCUMENT:
153 : case ParserInterface::PARSE_ERROR:
154 : // Not a feed, probably HTML. Continue to the web grabber stage.
155 9 : machine.setState(WEB_GRABBER);
156 9 : break;
157 :
158 0 : case ParserInterface::IN_PROGRESS:
159 : default:
160 0 : FANG_UNREACHABLE("Unexpected parser result in onFirstParseDone");
161 : // Treat as error and continue to web grabber
162 : machine.setState(WEB_GRABBER);
163 : break;
164 : }
165 26 : }
166 :
167 9 : void FeedDiscovery::onPageGrabberReady(WebPageGrabber* grabber, QString* document)
168 : {
169 : Q_UNUSED(grabber);
170 :
171 : // If we didn't get a document, bail here.
172 9 : if (!document || document->isEmpty()) {
173 6 : reportError("No page found");
174 7 : return;
175 : }
176 :
177 : // Parse feed URLs from the HTML document
178 3 : QList<QString> feedURLs = parseFeedsFromXHTML(*document);
179 3 : qDebug() << "Parsed" << feedURLs.count() << "feed URLs from HTML";
180 :
181 3 : if (feedURLs.isEmpty()) {
182 1 : qDebug() << "No feeds found in HTML!";
183 1 : reportError("No feed found");
184 1 : return;
185 : }
186 :
187 2 : qDebug() << "Total feed URLs found:" << feedURLs.count();
188 :
189 : // Sort by path length (longer paths first = more specific)
190 2 : QList<QString> feedURLStrings = feedURLs;
191 2 : std::sort(feedURLStrings.begin(), feedURLStrings.end(),
192 3 : [](const QString& a, const QString& b) {
193 3 : QUrl urlA(a);
194 3 : QUrl urlB(b);
195 6 : return urlA.path().length() > urlB.path().length();
196 3 : });
197 :
198 : // Convert to QUrl list and store for validation
199 2 : _sortedFeedURLs.clear();
200 6 : for (const QString& urlString : feedURLStrings) {
201 4 : _sortedFeedURLs.append(QUrl(urlString));
202 : }
203 :
204 : // Trigger bulk feed validation
205 2 : machine.setState(VALIDATE_FEEDS);
206 3 : }
207 :
208 29 : QList<QString> FeedDiscovery::parseFeedsFromXHTML(const QString& document)
209 : {
210 29 : QList<QString> feedsFound;
211 :
212 : // Examples of what we're looking for:
213 : // <link rel="alternate" href="http://www.fark.com/fark.rss" type="application/rss+xml" title="FARK.com Fark RSS Feed">
214 : // <link rel="alternate" type="application/rss+xml" title="MrEricSir.com RSS Feed" href="http://www.mrericsir.com/blog/feed/" />
215 : // <link rel="alternate" type="application/atom+xml" title="MrEricSir.com Atom Feed" href="http://www.mrericsir.com/blog/feed/atom/" />
216 29 : const QString S_REL = "rel";
217 29 : const QString S_HREF = "href";
218 29 : const QString S_TYPE = "type";
219 29 : const QString S_TITLE = "title";
220 29 : const QString S_WORDPRESS_COMMENTS_URL_SUFFIX = "/comments/feed/";
221 :
222 29 : QXmlStreamReader xml;
223 29 : xml.addData(document);
224 :
225 6393 : while (!xml.atEnd()) {
226 : // Grab the next thingie.
227 6363 : xml.readNext();
228 :
229 6363 : if (xml.isStartElement()) {
230 1610 : QString tagName = xml.name().toString().toLower();
231 1610 : if (tagName == "body") {
232 : // We're done with the header, so bail.
233 28 : return feedsFound;
234 : }
235 :
236 1582 : if (tagName == "link") {
237 479 : QXmlStreamAttributes attributes = xml.attributes();
238 :
239 : // Is this a feed?
240 1418 : if (attributes.hasAttribute(S_REL) && attributes.hasAttribute(S_HREF) &&
241 1449 : attributes.value("", S_REL).toString().toLower() == "alternate" &&
242 1494 : attributes.hasAttribute(S_TYPE) &&
243 573 : (attributes.value("", S_TYPE).toString().toLower() == "application/rss+xml" ||
244 499 : attributes.value("", S_TYPE).toString().toLower() == "application/atom+xml")) {
245 : // Run some checks and then add our feed if it seems reasonable to do so.
246 70 : QString url = attributes.value("", S_HREF).toString();
247 :
248 : // Avoid comments feeds as they tend to get added by accident.
249 35 : if (url.endsWith(S_WORDPRESS_COMMENTS_URL_SUFFIX)) {
250 7 : continue;
251 : }
252 :
253 28 : feedsFound << url;
254 35 : }
255 479 : }
256 1610 : }
257 : }
258 :
259 1 : return feedsFound;
260 29 : }
261 :
262 2 : void FeedDiscovery::onValidateFeeds()
263 : {
264 : // Use the sorted feed URLs from onPageGrabberReady
265 2 : if (_sortedFeedURLs.isEmpty()) {
266 0 : reportError("No feeds to validate");
267 0 : return;
268 : }
269 :
270 : // Bulk parse all feed URLs
271 2 : feedParser->parse(_sortedFeedURLs);
272 : }
273 :
274 2 : void FeedDiscovery::onFeedParserReady()
275 : {
276 : // Process all parsed feeds
277 2 : _discoveredFeeds.clear();
278 :
279 2 : QMap<QUrl, ParserInterface::ParseResult> results = feedParser->getResults();
280 6 : for (auto it = results.constBegin(); it != results.constEnd(); ++it) {
281 4 : QUrl feedURL = it.key();
282 4 : ParserInterface::ParseResult result = it.value();
283 :
284 : // Only include successfully parsed feeds
285 4 : if (result == ParserInterface::OK) {
286 2 : RawFeed* feed = feedParser->getFeed(feedURL);
287 2 : if (feed) {
288 2 : DiscoveredFeed discovered;
289 2 : discovered.url = feedURL;
290 2 : discovered.feed = feed; // Feed is owned by feedParser
291 2 : discovered.title = feed->title.isEmpty() ? feedURL.toString() : feed->title;
292 2 : discovered.content = ""; // Not storing raw content anymore
293 2 : discovered.validated = true;
294 2 : _discoveredFeeds.append(discovered);
295 2 : }
296 : }
297 4 : }
298 :
299 : // Check if we found any valid feeds
300 2 : if (_discoveredFeeds.isEmpty()) {
301 1 : reportError("No valid feeds found");
302 1 : return;
303 : }
304 :
305 : // Set the first valid feed as the primary one (for backward compatibility)
306 1 : _feedURL = _discoveredFeeds.first().url;
307 1 : _feedResult = _discoveredFeeds.first().feed;
308 :
309 : // Emit done signal
310 1 : machine.setState(FEED_FOUND);
311 2 : }
312 :
313 8 : void FeedDiscovery::reportError(QString errorString)
314 : {
315 8 : _error = true;
316 8 : _errorString = errorString;
317 :
318 8 : machine.setState(FEED_ERROR);
319 8 : }
|