Line data Source code
1 : #include "GoogleNewsSitemapSynthesizer.h"
2 : #include "FangLogging.h"
3 :
4 : #include <algorithm>
5 : #include <QSet>
6 :
7 66 : static QString stripWww(const QString& host)
8 : {
9 66 : if (host.startsWith("www.")) {
10 56 : return host.mid(4);
11 : }
12 10 : return host;
13 : }
14 :
15 14 : GoogleNewsSitemapSynthesizer::GoogleNewsSitemapSynthesizer(QObject* parent)
16 : : FangObject(parent)
17 14 : , isRefresh(false)
18 14 : , _hasError(false)
19 14 : , _result(nullptr)
20 14 : , downloader(nullptr)
21 : {
22 14 : }
23 :
24 16 : GoogleNewsSitemapSynthesizer::~GoogleNewsSitemapSynthesizer()
25 : {
26 16 : }
27 :
28 3 : QStringList GoogleNewsSitemapSynthesizer::newsSitemapPaths()
29 : {
30 : return {
31 : "/news-sitemap.xml",
32 : "/sitemap_news.xml",
33 : "/news-sitemap-content.xml"
34 12 : };
35 3 : }
36 :
37 3 : void GoogleNewsSitemapSynthesizer::synthesize(const QUrl& siteUrl, const QString& siteTitle)
38 : {
39 3 : isRefresh = false;
40 3 : feedTitle = siteTitle;
41 3 : since = QDateTime();
42 :
43 : // Build base URL for probing.
44 3 : siteBaseUrl.setScheme(siteUrl.scheme());
45 3 : siteBaseUrl.setHost(siteUrl.host());
46 :
47 6 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: starting discovery for" << siteBaseUrl;
48 :
49 : // Start by fetching robots.txt to discover news sitemap URLs.
50 3 : fetchRobotsTxt();
51 3 : }
52 :
53 0 : void GoogleNewsSitemapSynthesizer::synthesize(const QUrl& sitemapUrl, const QString& feedTitle,
54 : const QDateTime& since)
55 : {
56 0 : isRefresh = true;
57 0 : this->feedTitle = feedTitle;
58 0 : this->since = since;
59 :
60 : // For refresh, we already know the exact sitemap URL.
61 0 : candidateUrls.clear();
62 0 : candidateUrls.append(sitemapUrl);
63 :
64 0 : siteBaseUrl.setScheme(sitemapUrl.scheme());
65 0 : siteBaseUrl.setHost(sitemapUrl.host());
66 :
67 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: refreshing from" << sitemapUrl
68 0 : << "since" << since;
69 :
70 0 : tryNextCandidate();
71 0 : }
72 :
73 3 : void GoogleNewsSitemapSynthesizer::fetchRobotsTxt()
74 : {
75 3 : QUrl robotsUrl = siteBaseUrl;
76 3 : robotsUrl.setPath("/robots.txt");
77 :
78 6 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: fetching" << robotsUrl;
79 :
80 3 : downloader = new NetworkDownloadCore({}, this, nullptr);
81 3 : connect(downloader, &NetworkDownloadCore::finished,
82 3 : this, &GoogleNewsSitemapSynthesizer::onRobotsTxtDownloaded);
83 3 : connect(downloader, &NetworkDownloadCore::error,
84 3 : this, &GoogleNewsSitemapSynthesizer::onRobotsTxtDownloadError);
85 3 : downloader->download(robotsUrl);
86 3 : }
87 :
88 11 : QList<QUrl> GoogleNewsSitemapSynthesizer::parseRobotsSitemaps(const QString& robotsTxt,
89 : const QUrl& siteBaseUrl)
90 : {
91 11 : QList<QUrl> newsSitemaps;
92 11 : QList<QUrl> genericSitemaps;
93 11 : QStringList lines = robotsTxt.split('\n');
94 :
95 58 : for (const QString& line : lines) {
96 47 : QString trimmed = line.trimmed();
97 47 : if (trimmed.startsWith("Sitemap:", Qt::CaseInsensitive)) {
98 33 : QString urlStr = trimmed.mid(8).trimmed();
99 33 : QUrl url(urlStr);
100 33 : if (url.isValid() && stripWww(url.host()) == stripWww(siteBaseUrl.host())) {
101 32 : if (url.path().contains("news", Qt::CaseInsensitive)) {
102 12 : newsSitemaps.append(url);
103 : } else {
104 20 : genericSitemaps.append(url);
105 : }
106 : }
107 33 : }
108 47 : }
109 :
110 : // News-specific sitemaps first, then generic ones (which may be sitemap
111 : // indexes that reference a news sitemap, e.g. ESPN's /sitemap.xml).
112 11 : newsSitemaps.append(genericSitemaps);
113 22 : return newsSitemaps;
114 11 : }
115 :
116 0 : void GoogleNewsSitemapSynthesizer::onRobotsTxtDownloaded(const QUrl& url, const QByteArray& data)
117 : {
118 : Q_UNUSED(url);
119 :
120 0 : QString robotsTxt = QString::fromUtf8(data);
121 0 : QList<QUrl> robotsSitemaps = parseRobotsSitemaps(robotsTxt, siteBaseUrl);
122 :
123 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: found" << robotsSitemaps.size()
124 0 : << "news sitemaps in robots.txt";
125 :
126 0 : buildCandidateUrls(robotsSitemaps);
127 0 : tryNextCandidate();
128 0 : }
129 :
130 3 : void GoogleNewsSitemapSynthesizer::onRobotsTxtDownloadError(const QUrl& url, const QString& errorString)
131 : {
132 : Q_UNUSED(url);
133 6 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: robots.txt fetch failed:" << errorString
134 3 : << ", trying well-known paths";
135 :
136 3 : buildCandidateUrls({});
137 3 : tryNextCandidate();
138 3 : }
139 :
140 3 : void GoogleNewsSitemapSynthesizer::buildCandidateUrls(const QList<QUrl>& robotsSitemaps)
141 : {
142 3 : candidateUrls.clear();
143 :
144 : // Robots.txt sitemaps first (most reliable).
145 3 : for (const QUrl& url : robotsSitemaps) {
146 0 : candidateUrls.append(url);
147 : }
148 :
149 : // Then well-known paths as fallback.
150 12 : for (const QString& path : newsSitemapPaths()) {
151 9 : QUrl candidate = siteBaseUrl;
152 9 : candidate.setPath(path);
153 : // Avoid duplicates from robots.txt.
154 9 : if (!candidateUrls.contains(candidate)) {
155 9 : candidateUrls.append(candidate);
156 : }
157 12 : }
158 :
159 6 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: probing" << candidateUrls.size()
160 3 : << "candidate URLs";
161 3 : }
162 :
163 12 : void GoogleNewsSitemapSynthesizer::tryNextCandidate()
164 : {
165 12 : if (candidateUrls.isEmpty()) {
166 3 : reportError("No feed found");
167 3 : return;
168 : }
169 :
170 9 : QUrl url = candidateUrls.takeFirst();
171 18 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: trying" << url;
172 :
173 9 : downloader = new NetworkDownloadCore({}, this, nullptr);
174 9 : connect(downloader, &NetworkDownloadCore::finished,
175 9 : this, &GoogleNewsSitemapSynthesizer::onCandidateDownloaded);
176 9 : connect(downloader, &NetworkDownloadCore::error,
177 9 : this, &GoogleNewsSitemapSynthesizer::onCandidateDownloadError);
178 9 : downloader->download(url);
179 9 : }
180 :
181 0 : void GoogleNewsSitemapSynthesizer::onCandidateDownloaded(const QUrl& url, const QByteArray& data)
182 : {
183 0 : QString xml = QString::fromUtf8(data);
184 0 : SitemapParser parser(this);
185 0 : SitemapParser::SitemapType type = parser.parse(xml);
186 :
187 0 : if (type == SitemapParser::Invalid) {
188 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: invalid XML from" << url
189 0 : << ", trying next candidate";
190 0 : tryNextCandidate();
191 0 : return;
192 : }
193 :
194 0 : if (type == SitemapParser::SitemapIndex) {
195 : // Store sub-sitemaps sorted by lastmod descending (most recent first).
196 0 : sitemapIndexUrl = url;
197 0 : accumulatedEntries.clear();
198 0 : pendingSubSitemaps = parser.subSitemaps();
199 0 : if (pendingSubSitemaps.isEmpty()) {
200 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: empty sitemap index from" << url;
201 0 : tryNextCandidate();
202 0 : return;
203 : }
204 :
205 0 : std::sort(pendingSubSitemaps.begin(), pendingSubSitemaps.end(),
206 0 : [](const SubSitemap& a, const SubSitemap& b) {
207 0 : bool aValid = a.lastmod.isValid();
208 0 : bool bValid = b.lastmod.isValid();
209 0 : if (aValid && bValid) {
210 0 : return a.lastmod > b.lastmod;
211 : }
212 0 : return aValid && !bValid;
213 : });
214 :
215 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: sitemap index with"
216 0 : << pendingSubSitemaps.size() << "sub-sitemaps";
217 :
218 0 : tryNextSubSitemap();
219 0 : return;
220 : }
221 :
222 : // UrlSet - check for Google News entries.
223 0 : if (!parser.hasNewsEntries()) {
224 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: urlset without news entries from" << url;
225 0 : tryNextCandidate();
226 0 : return;
227 : }
228 :
229 0 : processParsedEntries(parser.entries(), url);
230 0 : }
231 :
232 9 : void GoogleNewsSitemapSynthesizer::onCandidateDownloadError(const QUrl& url, const QString& errorString)
233 : {
234 18 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: download error for" << url
235 9 : << ":" << errorString;
236 9 : tryNextCandidate();
237 9 : }
238 :
239 0 : void GoogleNewsSitemapSynthesizer::tryNextSubSitemap()
240 : {
241 0 : if (pendingSubSitemaps.isEmpty()) {
242 0 : if (!accumulatedEntries.isEmpty()) {
243 : // Deduplicate repetitive wire content, then process.
244 0 : QList<SitemapEntry> deduped = deduplicateRepetitiveTitles(accumulatedEntries);
245 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: accumulated"
246 0 : << accumulatedEntries.size() << "entries from sub-sitemaps,"
247 0 : << deduped.size() << "after dedup";
248 0 : processParsedEntries(deduped, sitemapIndexUrl);
249 0 : return;
250 0 : }
251 : // None of the sub-sitemaps had news entries.
252 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: no sub-sitemaps with news entries";
253 0 : tryNextCandidate();
254 0 : return;
255 : }
256 :
257 0 : SubSitemap sub = pendingSubSitemaps.takeFirst();
258 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: trying sub-sitemap" << sub.url;
259 :
260 0 : downloader = new NetworkDownloadCore({}, this, nullptr);
261 0 : connect(downloader, &NetworkDownloadCore::finished,
262 0 : this, &GoogleNewsSitemapSynthesizer::onSubSitemapDownloaded);
263 0 : connect(downloader, &NetworkDownloadCore::error,
264 0 : this, &GoogleNewsSitemapSynthesizer::onSubSitemapDownloadError);
265 0 : downloader->download(sub.url);
266 0 : }
267 :
268 0 : void GoogleNewsSitemapSynthesizer::onSubSitemapDownloaded(const QUrl& url, const QByteArray& data)
269 : {
270 0 : QString xml = QString::fromUtf8(data);
271 0 : SitemapParser parser(this);
272 0 : SitemapParser::SitemapType type = parser.parse(xml);
273 :
274 0 : if (type == SitemapParser::UrlSet && parser.hasNewsEntries()) {
275 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: sub-sitemap" << url
276 0 : << "has" << parser.entries().size() << "news entries, accumulating";
277 0 : accumulatedEntries.append(parser.entries());
278 : } else {
279 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: sub-sitemap" << url
280 0 : << "has no news entries, skipping";
281 : }
282 :
283 0 : tryNextSubSitemap();
284 0 : }
285 :
286 0 : void GoogleNewsSitemapSynthesizer::onSubSitemapDownloadError(const QUrl& url, const QString& errorString)
287 : {
288 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: sub-sitemap download error for" << url
289 0 : << ":" << errorString;
290 0 : tryNextSubSitemap();
291 0 : }
292 :
293 0 : QString GoogleNewsSitemapSynthesizer::normalizeLanguage(const QString& lang)
294 : {
295 : // Normalize ISO 639-3 codes to ISO 639-1. Sites use both:
296 : // AP News uses "eng"/"spa", BBC uses "en"/"bn"/"hi".
297 : static const QMap<QString, QString> iso639_3to1 = {
298 : {"eng", "en"}, {"spa", "es"}, {"fra", "fr"}, {"deu", "de"},
299 : {"por", "pt"}, {"ita", "it"}, {"jpn", "ja"}, {"zho", "zh"},
300 : {"kor", "ko"}, {"ara", "ar"}, {"hin", "hi"}, {"rus", "ru"}
301 0 : };
302 :
303 0 : QString normalized = lang.toLower().section('-', 0, 0);
304 0 : if (iso639_3to1.contains(normalized)) {
305 0 : return iso639_3to1.value(normalized);
306 : }
307 0 : return normalized;
308 0 : }
309 :
310 0 : void GoogleNewsSitemapSynthesizer::processParsedEntries(const QList<SitemapEntry>& entries,
311 : const QUrl& sourceUrl)
312 : {
313 0 : feedSourceUrl = sourceUrl;
314 :
315 : // Filter to only entries with news:title.
316 0 : QList<SitemapEntry> newsEntries;
317 0 : for (const SitemapEntry& entry : entries) {
318 0 : if (!entry.newsTitle.isEmpty()) {
319 0 : newsEntries.append(entry);
320 : }
321 : }
322 :
323 0 : if (newsEntries.isEmpty()) {
324 0 : reportError("No feed found");
325 0 : return;
326 : }
327 :
328 : // Use the publication name as the feed title if we don't have a better one.
329 0 : if (!newsEntries.first().publicationName.isEmpty()) {
330 0 : QString pubName = newsEntries.first().publicationName;
331 : // Only override generic/empty titles (host names, etc.)
332 0 : if (feedTitle.isEmpty() || feedTitle == siteBaseUrl.host()) {
333 0 : feedTitle = pubName;
334 : }
335 0 : }
336 :
337 : // Filter by language: find the majority language and keep only matching entries.
338 0 : QMap<QString, int> langCounts;
339 0 : for (const SitemapEntry& entry : newsEntries) {
340 0 : if (!entry.language.isEmpty()) {
341 0 : langCounts[normalizeLanguage(entry.language)]++;
342 : }
343 : }
344 :
345 0 : if (!langCounts.isEmpty()) {
346 0 : QString majorityLang;
347 0 : int maxCount = 0;
348 0 : for (auto it = langCounts.cbegin(); it != langCounts.cend(); ++it) {
349 0 : if (it.value() > maxCount) {
350 0 : maxCount = it.value();
351 0 : majorityLang = it.key();
352 : }
353 : }
354 :
355 0 : QList<SitemapEntry> filtered;
356 0 : for (const SitemapEntry& entry : newsEntries) {
357 0 : if (entry.language.isEmpty()
358 0 : || normalizeLanguage(entry.language) == majorityLang) {
359 0 : filtered.append(entry);
360 : }
361 : }
362 0 : newsEntries = filtered;
363 :
364 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: filtered to language"
365 0 : << majorityLang << "(" << newsEntries.size() << "entries)";
366 0 : }
367 :
368 : // Sort by publication date descending.
369 0 : std::sort(newsEntries.begin(), newsEntries.end(),
370 0 : [](const SitemapEntry& a, const SitemapEntry& b) {
371 : // Prefer publicationDate, fall back to lastmod.
372 0 : QDateTime dateA = a.publicationDate.isValid() ? a.publicationDate : a.lastmod;
373 0 : QDateTime dateB = b.publicationDate.isValid() ? b.publicationDate : b.lastmod;
374 0 : bool aValid = dateA.isValid();
375 0 : bool bValid = dateB.isValid();
376 0 : if (aValid && bValid) {
377 0 : return dateA > dateB;
378 : }
379 0 : return aValid && !bValid;
380 0 : });
381 :
382 : // Filter by since date for refresh.
383 0 : if (isRefresh && since.isValid()) {
384 0 : QList<SitemapEntry> recent;
385 0 : for (const SitemapEntry& entry : newsEntries) {
386 0 : QDateTime date = entry.publicationDate.isValid()
387 0 : ? entry.publicationDate : entry.lastmod;
388 0 : if (date.isValid() && date > since) {
389 0 : recent.append(entry);
390 : }
391 0 : }
392 :
393 0 : if (recent.isEmpty()) {
394 : // No new entries since last refresh - emit empty feed.
395 0 : _result = new RawFeed(this);
396 0 : _result->feedType = RawFeed::GoogleNewsSitemap;
397 0 : _result->title = feedTitle;
398 0 : _result->url = feedSourceUrl;
399 0 : emit done();
400 0 : return;
401 : }
402 :
403 0 : newsEntries = recent;
404 0 : }
405 :
406 : // Limit to MAX_ENTRIES.
407 0 : if (newsEntries.size() > MAX_ENTRIES) {
408 0 : newsEntries = newsEntries.mid(0, MAX_ENTRIES);
409 : }
410 :
411 0 : feedEntries = newsEntries;
412 :
413 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: selected" << feedEntries.size()
414 0 : << "entries from" << feedSourceUrl;
415 :
416 0 : buildRawFeed();
417 0 : emit done();
418 0 : }
419 :
420 0 : void GoogleNewsSitemapSynthesizer::buildRawFeed()
421 : {
422 0 : _result = new RawFeed(this);
423 0 : _result->feedType = RawFeed::GoogleNewsSitemap;
424 0 : _result->title = feedTitle;
425 0 : _result->url = feedSourceUrl;
426 0 : _result->siteURL = QUrl(siteBaseUrl.scheme() + "://" + siteBaseUrl.host());
427 :
428 0 : for (const SitemapEntry& entry : feedEntries) {
429 0 : auto* item = new RawNews(_result);
430 0 : item->guid = entry.url.toString();
431 0 : item->title = entry.newsTitle;
432 0 : item->url = entry.url;
433 0 : item->author = QString("");
434 0 : item->timestamp = entry.publicationDate.isValid()
435 0 : ? entry.publicationDate
436 0 : : (entry.lastmod.isValid() ? entry.lastmod : QDateTime::currentDateTime());
437 :
438 : // Embed the sitemap thumbnail image in the content so it flows
439 : // through the normal HTML sanitizer and image pipeline.
440 0 : if (entry.imageUrl.isValid()) {
441 0 : item->content = "<img src=\"" + entry.imageUrl.toString() + "\"/>";
442 : } else {
443 0 : item->content = QString("");
444 : }
445 0 : item->description = QString("");
446 :
447 0 : _result->items.append(item);
448 : }
449 :
450 0 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer: built feed with"
451 0 : << _result->items.size() << "items";
452 0 : }
453 :
454 12 : void GoogleNewsSitemapSynthesizer::setResultState(RawFeed* result, bool hasError,
455 : const QString& errorString)
456 : {
457 12 : _result = result;
458 12 : _hasError = hasError;
459 12 : _errorString = errorString;
460 12 : }
461 :
462 4 : QList<SitemapEntry> GoogleNewsSitemapSynthesizer::deduplicateRepetitiveTitles(
463 : const QList<SitemapEntry>& entries, int prefixWordCount, int repetitionThreshold)
464 : {
465 : // Group entries by their first N words (lowercased).
466 4 : QMap<QString, QList<int>> prefixGroups;
467 37 : for (int i = 0; i < entries.size(); ++i) {
468 33 : QStringList words = entries[i].newsTitle.toLower().split(' ', Qt::SkipEmptyParts);
469 33 : QString key;
470 33 : if (words.size() >= prefixWordCount) {
471 33 : key = QStringList(words.mid(0, prefixWordCount)).join(' ');
472 : } else {
473 0 : key = words.join(' ');
474 : }
475 33 : prefixGroups[key].append(i);
476 33 : }
477 :
478 : // Build a set of indices to exclude (all but the most recent in large groups).
479 4 : QSet<int> excluded;
480 22 : for (auto it = prefixGroups.cbegin(); it != prefixGroups.cend(); ++it) {
481 18 : const QList<int>& indices = it.value();
482 18 : if (indices.size() <= repetitionThreshold) {
483 16 : continue;
484 : }
485 :
486 : // Find the most recent entry in this group.
487 2 : int bestIdx = indices.first();
488 2 : QDateTime bestDate;
489 17 : for (int idx : indices) {
490 15 : QDateTime date = entries[idx].publicationDate.isValid()
491 15 : ? entries[idx].publicationDate : entries[idx].lastmod;
492 15 : if (!bestDate.isValid() || (date.isValid() && date > bestDate)) {
493 3 : bestDate = date;
494 3 : bestIdx = idx;
495 : }
496 15 : }
497 :
498 : // Exclude all but the best.
499 17 : for (int idx : indices) {
500 15 : if (idx != bestIdx) {
501 13 : excluded.insert(idx);
502 : }
503 : }
504 2 : }
505 :
506 : // Build filtered list preserving original order.
507 4 : QList<SitemapEntry> result;
508 37 : for (int i = 0; i < entries.size(); ++i) {
509 33 : if (!excluded.contains(i)) {
510 20 : result.append(entries[i]);
511 : }
512 : }
513 8 : return result;
514 4 : }
515 :
516 3 : void GoogleNewsSitemapSynthesizer::reportError(const QString& error)
517 : {
518 3 : _hasError = true;
519 3 : _errorString = error;
520 6 : qCDebug(logUtility) << "GoogleNewsSitemapSynthesizer error:" << error;
521 3 : emit done();
522 3 : }
|