LCOV - code coverage report
Current view: top level - lib/FangFeedDiscovery - NewsSitemapSynthesizer.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 36.4 % 387 141
Test Date: 2026-04-19 00:35:54 Functions: 50.0 % 32 16

            Line data    Source code
       1              : #include "NewsSitemapSynthesizer.h"
       2              : #include "QWebDownload.h"
       3              : #include "PageMetadataExtractor.h"
       4              : #include "WebPageGrabber.h"
       5              : #include "FeedDiscoveryLogging.h"
       6              : 
       7              : #include <algorithm>
       8              : #include <QSet>
       9              : 
      10           66 : static QString stripWww(const QString& host)
      11              : {
      12           66 :     if (host.startsWith("www.")) {
      13           56 :         return host.mid(4);
      14              :     }
      15           10 :     return host;
      16              : }
      17              : 
      18           14 : NewsSitemapSynthesizer::NewsSitemapSynthesizer(QObject* parent)
      19              :     : QObject(parent)
      20           14 :     , isRefresh(false)
      21           14 :     , state(IDLE)
      22           14 :     , _hasError(false)
      23           14 :     , _result(nullptr)
      24           14 :     , downloader(new QWebDownload({}, this, nullptr))
      25           42 :     , descriptionDownloader(nullptr)
      26              : {
      27           14 :     connect(downloader, &QWebDownload::finished,
      28           14 :             this, &NewsSitemapSynthesizer::onDownloadFinished);
      29           14 :     connect(downloader, &QWebDownload::error,
      30           14 :             this, &NewsSitemapSynthesizer::onDownloadError);
      31           14 : }
      32              : 
      33           16 : NewsSitemapSynthesizer::~NewsSitemapSynthesizer()
      34              : {
      35           16 : }
      36              : 
      37            3 : QStringList NewsSitemapSynthesizer::newsSitemapPaths()
      38              : {
      39              :     return {
      40              :         "/news-sitemap.xml",
      41              :         "/sitemap_news.xml",
      42              :         "/news-sitemap-content.xml"
      43           12 :     };
      44            3 : }
      45              : 
      46            3 : void NewsSitemapSynthesizer::synthesize(const QUrl& siteUrl, const QString& siteTitle)
      47              : {
      48            3 :     isRefresh = false;
      49            3 :     feedTitle = siteTitle;
      50            3 :     since = QDateTime();
      51              : 
      52              :     // Build base URL for probing.
      53            3 :     siteBaseUrl.setScheme(siteUrl.scheme());
      54            3 :     siteBaseUrl.setHost(siteUrl.host());
      55              : 
      56            6 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: starting discovery for" << siteBaseUrl;
      57              : 
      58              :     // Start by fetching robots.txt to discover news sitemap URLs.
      59            3 :     fetchRobotsTxt();
      60            3 : }
      61              : 
      62            0 : void NewsSitemapSynthesizer::synthesize(const QUrl& sitemapUrl, const QString& feedTitle,
      63              :                                         const QDateTime& since)
      64              : {
      65            0 :     isRefresh = true;
      66            0 :     this->feedTitle = feedTitle;
      67            0 :     this->since = since;
      68              : 
      69              :     // For refresh, we already know the exact sitemap URL.
      70            0 :     candidateUrls.clear();
      71            0 :     candidateUrls.append(sitemapUrl);
      72              : 
      73            0 :     siteBaseUrl.setScheme(sitemapUrl.scheme());
      74            0 :     siteBaseUrl.setHost(sitemapUrl.host());
      75              : 
      76            0 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: refreshing from" << sitemapUrl
      77            0 :                         << "since" << since;
      78              : 
      79            0 :     tryNextCandidate();
      80            0 : }
      81              : 
      82            3 : void NewsSitemapSynthesizer::fetchRobotsTxt()
      83              : {
      84            3 :     QUrl robotsUrl = siteBaseUrl;
      85            3 :     robotsUrl.setPath("/robots.txt");
      86              : 
      87            6 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: fetching" << robotsUrl;
      88              : 
      89            3 :     state = FETCHING_ROBOTS_TXT;
      90            3 :     downloader->get(robotsUrl);
      91            3 : }
      92              : 
      93           11 : QList<QUrl> NewsSitemapSynthesizer::parseRobotsSitemaps(const QString& robotsTxt,
      94              :                                                          const QUrl& siteBaseUrl)
      95              : {
      96           11 :     QList<QUrl> newsSitemaps;
      97           11 :     QList<QUrl> genericSitemaps;
      98           11 :     QStringList lines = robotsTxt.split('\n');
      99              : 
     100           58 :     for (const QString& line : lines) {
     101           47 :         QString trimmed = line.trimmed();
     102           47 :         if (trimmed.startsWith("Sitemap:", Qt::CaseInsensitive)) {
     103           33 :             QString urlStr = trimmed.mid(8).trimmed();
     104           33 :             QUrl url(urlStr);
     105           33 :             if (url.isValid() && stripWww(url.host()) == stripWww(siteBaseUrl.host())) {
     106           32 :                 if (url.path().contains("news", Qt::CaseInsensitive)) {
     107           12 :                     newsSitemaps.append(url);
     108              :                 } else {
     109           20 :                     genericSitemaps.append(url);
     110              :                 }
     111              :             }
     112           33 :         }
     113           47 :     }
     114              : 
     115              :     // News-specific sitemaps first, then generic ones (which may be sitemap
     116              :     // indexes that reference a news sitemap, e.g. ESPN's /sitemap.xml).
     117           11 :     newsSitemaps.append(genericSitemaps);
     118           22 :     return newsSitemaps;
     119           11 : }
     120              : 
     121            0 : void NewsSitemapSynthesizer::onDownloadFinished(const QUrl& url, const QByteArray& data)
     122              : {
     123            0 :     switch (state) {
     124            0 :     case FETCHING_ROBOTS_TXT:
     125            0 :         handleRobotsTxtResponse(url, data);
     126            0 :         break;
     127            0 :     case FETCHING_CANDIDATE:
     128            0 :         handleCandidateResponse(url, data);
     129            0 :         break;
     130            0 :     case FETCHING_SUB_SITEMAP:
     131            0 :         handleSubSitemapResponse(url, data);
     132            0 :         break;
     133            0 :     case IDLE:
     134              :     case FETCHING_DESCRIPTIONS:
     135            0 :         break;
     136              :     }
     137            0 : }
     138              : 
     139           12 : void NewsSitemapSynthesizer::onDownloadError(const QUrl& url, const QString& errorString)
     140              : {
     141           12 :     switch (state) {
     142            3 :     case FETCHING_ROBOTS_TXT:
     143            3 :         handleRobotsTxtError(url, errorString);
     144            3 :         break;
     145            9 :     case FETCHING_CANDIDATE:
     146            9 :         handleCandidateError(url, errorString);
     147            9 :         break;
     148            0 :     case FETCHING_SUB_SITEMAP:
     149            0 :         handleSubSitemapError(url, errorString);
     150            0 :         break;
     151            0 :     case IDLE:
     152              :     case FETCHING_DESCRIPTIONS:
     153            0 :         break;
     154              :     }
     155           12 : }
     156              : 
     157            0 : void NewsSitemapSynthesizer::handleRobotsTxtResponse(const QUrl& url, const QByteArray& data)
     158              : {
     159              :     Q_UNUSED(url);
     160              : 
     161            0 :     QString robotsTxt = QString::fromUtf8(data);
     162            0 :     QList<QUrl> robotsSitemaps = parseRobotsSitemaps(robotsTxt, siteBaseUrl);
     163              : 
     164            0 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: found" << robotsSitemaps.size()
     165            0 :                         << "news sitemaps in robots.txt";
     166              : 
     167            0 :     buildCandidateUrls(robotsSitemaps);
     168            0 :     tryNextCandidate();
     169            0 : }
     170              : 
     171            3 : void NewsSitemapSynthesizer::handleRobotsTxtError(const QUrl& url, const QString& errorString)
     172              : {
     173              :     Q_UNUSED(url);
     174            6 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: robots.txt fetch failed:" << errorString
     175            3 :                         << ", trying well-known paths";
     176              : 
     177            3 :     buildCandidateUrls({});
     178            3 :     tryNextCandidate();
     179            3 : }
     180              : 
     181            3 : void NewsSitemapSynthesizer::buildCandidateUrls(const QList<QUrl>& robotsSitemaps)
     182              : {
     183            3 :     candidateUrls.clear();
     184              : 
     185              :     // Robots.txt sitemaps first (most reliable).
     186            3 :     for (const QUrl& url : robotsSitemaps) {
     187            0 :         candidateUrls.append(url);
     188              :     }
     189              : 
     190              :     // Then well-known paths as fallback.
     191           12 :     for (const QString& path : newsSitemapPaths()) {
     192            9 :         QUrl candidate = siteBaseUrl;
     193            9 :         candidate.setPath(path);
     194              :         // Avoid duplicates from robots.txt.
     195            9 :         if (!candidateUrls.contains(candidate)) {
     196            9 :             candidateUrls.append(candidate);
     197              :         }
     198           12 :     }
     199              : 
     200            6 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: probing" << candidateUrls.size()
     201            3 :                         << "candidate URLs";
     202            3 : }
     203              : 
     204           12 : void NewsSitemapSynthesizer::tryNextCandidate()
     205              : {
     206           12 :     if (candidateUrls.isEmpty()) {
     207            3 :         reportError("No feed found");
     208            3 :         return;
     209              :     }
     210              : 
     211            9 :     QUrl url = candidateUrls.takeFirst();
     212           18 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: trying" << url;
     213              : 
     214            9 :     state = FETCHING_CANDIDATE;
     215            9 :     downloader->get(url);
     216            9 : }
     217              : 
     218            0 : void NewsSitemapSynthesizer::handleCandidateResponse(const QUrl& url, const QByteArray& data)
     219              : {
     220            0 :     SitemapParser parser;
     221            0 :     SitemapParser::SitemapType type = parser.parse(data);
     222              : 
     223            0 :     if (type == SitemapParser::Invalid) {
     224            0 :         qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: invalid XML from" << url
     225            0 :                             << ", trying next candidate";
     226            0 :         tryNextCandidate();
     227            0 :         return;
     228              :     }
     229              : 
     230            0 :     if (type == SitemapParser::SitemapIndex) {
     231              :         // Store sub-sitemaps sorted by lastmod descending (most recent first).
     232            0 :         sitemapIndexUrl = url;
     233            0 :         accumulatedEntries.clear();
     234            0 :         pendingSubSitemaps = parser.subSitemaps();
     235            0 :         if (pendingSubSitemaps.isEmpty()) {
     236            0 :             qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: empty sitemap index from" << url;
     237            0 :             tryNextCandidate();
     238            0 :             return;
     239              :         }
     240              : 
     241            0 :         std::sort(pendingSubSitemaps.begin(), pendingSubSitemaps.end(),
     242            0 :             [](const SubSitemap& a, const SubSitemap& b) {
     243            0 :                 bool aValid = a.lastmod.isValid();
     244            0 :                 bool bValid = b.lastmod.isValid();
     245            0 :                 if (aValid && bValid) {
     246            0 :                     return a.lastmod > b.lastmod;
     247              :                 }
     248            0 :                 return aValid && !bValid;
     249              :             });
     250              : 
     251            0 :         qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: sitemap index with"
     252            0 :                             << pendingSubSitemaps.size() << "sub-sitemaps";
     253              : 
     254            0 :         tryNextSubSitemap();
     255            0 :         return;
     256              :     }
     257              : 
     258              :     // UrlSet - check for news entries.
     259            0 :     if (!parser.hasNewsEntries()) {
     260            0 :         qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: urlset without news entries from" << url;
     261            0 :         tryNextCandidate();
     262            0 :         return;
     263              :     }
     264              : 
     265            0 :     processParsedEntries(parser.entries(), url);
     266            0 : }
     267              : 
     268            9 : void NewsSitemapSynthesizer::handleCandidateError(const QUrl& url, const QString& errorString)
     269              : {
     270           18 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: download error for" << url
     271            9 :                         << ":" << errorString;
     272            9 :     tryNextCandidate();
     273            9 : }
     274              : 
     275            0 : void NewsSitemapSynthesizer::tryNextSubSitemap()
     276              : {
     277            0 :     if (pendingSubSitemaps.isEmpty()) {
     278            0 :         if (!accumulatedEntries.isEmpty()) {
     279              :             // Deduplicate repetitive wire content, then process.
     280            0 :             QList<SitemapEntry> deduped = deduplicateRepetitiveTitles(accumulatedEntries);
     281            0 :             qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: accumulated"
     282            0 :                                 << accumulatedEntries.size() << "entries from sub-sitemaps,"
     283            0 :                                 << deduped.size() << "after dedup";
     284            0 :             processParsedEntries(deduped, sitemapIndexUrl);
     285            0 :             return;
     286            0 :         }
     287              :         // None of the sub-sitemaps had news entries.
     288            0 :         qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: no sub-sitemaps with news entries";
     289            0 :         tryNextCandidate();
     290            0 :         return;
     291              :     }
     292              : 
     293            0 :     SubSitemap sub = pendingSubSitemaps.takeFirst();
     294            0 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: trying sub-sitemap" << sub.url;
     295              : 
     296            0 :     state = FETCHING_SUB_SITEMAP;
     297            0 :     downloader->get(sub.url);
     298            0 : }
     299              : 
     300            0 : void NewsSitemapSynthesizer::handleSubSitemapResponse(const QUrl& url, const QByteArray& data)
     301              : {
     302            0 :     SitemapParser parser;
     303            0 :     SitemapParser::SitemapType type = parser.parse(data);
     304              : 
     305            0 :     if (type == SitemapParser::UrlSet && parser.hasNewsEntries()) {
     306            0 :         qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: sub-sitemap" << url
     307            0 :                             << "has" << parser.entries().size() << "news entries, accumulating";
     308            0 :         accumulatedEntries.append(parser.entries());
     309              :     } else {
     310            0 :         qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: sub-sitemap" << url
     311            0 :                             << "has no news entries, skipping";
     312              :     }
     313              : 
     314            0 :     tryNextSubSitemap();
     315            0 : }
     316              : 
     317            0 : void NewsSitemapSynthesizer::handleSubSitemapError(const QUrl& url, const QString& errorString)
     318              : {
     319            0 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: sub-sitemap download error for" << url
     320            0 :                         << ":" << errorString;
     321            0 :     tryNextSubSitemap();
     322            0 : }
     323              : 
     324            0 : QString NewsSitemapSynthesizer::normalizeLanguage(const QString& lang)
     325              : {
     326              :     // Normalize ISO 639-3 codes to ISO 639-1. Sites use both:
     327              :     // AP News uses "eng"/"spa", BBC uses "en"/"bn"/"hi".
     328              :     static const QMap<QString, QString> iso639_3to1 = {
     329              :         {"eng", "en"}, {"spa", "es"}, {"fra", "fr"}, {"deu", "de"},
     330              :         {"por", "pt"}, {"ita", "it"}, {"jpn", "ja"}, {"zho", "zh"},
     331              :         {"kor", "ko"}, {"ara", "ar"}, {"hin", "hi"}, {"rus", "ru"}
     332            0 :     };
     333              : 
     334            0 :     QString normalized = lang.toLower().section('-', 0, 0);
     335            0 :     if (iso639_3to1.contains(normalized)) {
     336            0 :         return iso639_3to1.value(normalized);
     337              :     }
     338            0 :     return normalized;
     339            0 : }
     340              : 
     341            0 : void NewsSitemapSynthesizer::filterByLanguage(QList<SitemapEntry>& entries)
     342              : {
     343            0 :     QMap<QString, int> langCounts;
     344            0 :     for (const SitemapEntry& entry : entries) {
     345            0 :         if (!entry.language.isEmpty()) {
     346            0 :             langCounts[normalizeLanguage(entry.language)]++;
     347              :         }
     348              :     }
     349              : 
     350            0 :     if (langCounts.isEmpty()) {
     351            0 :         return;
     352              :     }
     353              : 
     354            0 :     QString majorityLang;
     355            0 :     int maxCount = 0;
     356            0 :     for (auto it = langCounts.cbegin(); it != langCounts.cend(); ++it) {
     357            0 :         if (it.value() > maxCount) {
     358            0 :             maxCount = it.value();
     359            0 :             majorityLang = it.key();
     360              :         }
     361              :     }
     362              : 
     363            0 :     QList<SitemapEntry> filtered;
     364            0 :     for (const SitemapEntry& entry : entries) {
     365            0 :         if (entry.language.isEmpty()
     366            0 :             || normalizeLanguage(entry.language) == majorityLang) {
     367            0 :             filtered.append(entry);
     368              :         }
     369              :     }
     370            0 :     entries = filtered;
     371              : 
     372            0 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: filtered to language"
     373            0 :                         << majorityLang << "(" << entries.size() << "entries)";
     374            0 : }
     375              : 
     376            0 : bool NewsSitemapSynthesizer::filterBySinceDate(QList<SitemapEntry>& entries)
     377              : {
     378            0 :     if (!isRefresh || !since.isValid()) {
     379            0 :         return false;
     380              :     }
     381              : 
     382            0 :     QList<SitemapEntry> recent;
     383            0 :     for (const SitemapEntry& entry : entries) {
     384            0 :         QDateTime date = entry.publicationDate.isValid()
     385            0 :             ? entry.publicationDate : entry.lastmod;
     386            0 :         if (date.isValid() && date > since) {
     387            0 :             recent.append(entry);
     388              :         }
     389            0 :     }
     390            0 :     entries = recent;
     391            0 :     return true;
     392            0 : }
     393              : 
     394            0 : void NewsSitemapSynthesizer::processParsedEntries(const QList<SitemapEntry>& entries,
     395              :                                                    const QUrl& sourceUrl)
     396              : {
     397            0 :     feedSourceUrl = sourceUrl;
     398              : 
     399              :     // Filter to only entries with news:title.
     400            0 :     QList<SitemapEntry> newsEntries;
     401            0 :     for (const SitemapEntry& entry : entries) {
     402            0 :         if (!entry.newsTitle.isEmpty()) {
     403            0 :             newsEntries.append(entry);
     404              :         }
     405              :     }
     406              : 
     407            0 :     if (newsEntries.isEmpty()) {
     408            0 :         reportError("No feed found");
     409            0 :         return;
     410              :     }
     411              : 
     412              :     // Use the publication name as the feed title if we don't have a better one.
     413            0 :     if (!newsEntries.first().publicationName.isEmpty()) {
     414            0 :         QString pubName = newsEntries.first().publicationName;
     415            0 :         if (feedTitle.isEmpty() || feedTitle == siteBaseUrl.host()) {
     416            0 :             feedTitle = pubName;
     417              :         }
     418            0 :     }
     419              : 
     420            0 :     filterByLanguage(newsEntries);
     421              : 
     422              :     // Sort by publication date descending.
     423            0 :     std::sort(newsEntries.begin(), newsEntries.end(),
     424            0 :         [](const SitemapEntry& a, const SitemapEntry& b) {
     425            0 :             QDateTime dateA = a.publicationDate.isValid() ? a.publicationDate : a.lastmod;
     426            0 :             QDateTime dateB = b.publicationDate.isValid() ? b.publicationDate : b.lastmod;
     427            0 :             bool aValid = dateA.isValid();
     428            0 :             bool bValid = dateB.isValid();
     429            0 :             if (aValid && bValid) {
     430            0 :                 return dateA > dateB;
     431              :             }
     432            0 :             return aValid && !bValid;
     433            0 :         });
     434              : 
     435            0 :     if (filterBySinceDate(newsEntries) && newsEntries.isEmpty()) {
     436              :         // No new entries since last refresh - emit empty feed.
     437            0 :         state = IDLE;
     438            0 :         _result = std::make_shared<RawFeed>();
     439            0 :         _result->feedType = RawFeed::GoogleNewsSitemap;
     440            0 :         _result->title = feedTitle;
     441            0 :         _result->url = feedSourceUrl;
     442            0 :         emit done();
     443            0 :         return;
     444              :     }
     445              : 
     446              :     // Limit to MAX_ENTRIES.
     447            0 :     if (newsEntries.size() > MAX_ENTRIES) {
     448            0 :         newsEntries = newsEntries.mid(0, MAX_ENTRIES);
     449              :     }
     450              : 
     451            0 :     feedEntries = newsEntries;
     452              : 
     453            0 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: selected" << feedEntries.size()
     454            0 :                         << "entries from" << feedSourceUrl;
     455              : 
     456            0 :     fetchDescriptions();
     457            0 : }
     458              : 
     459            0 : void NewsSitemapSynthesizer::fetchDescriptions()
     460              : {
     461            0 :     state = FETCHING_DESCRIPTIONS;
     462            0 :     fetchedDescriptions.clear();
     463              : 
     464            0 :     QList<QUrl> urls;
     465            0 :     for (const SitemapEntry& entry : feedEntries) {
     466            0 :         urls.append(entry.url);
     467              :     }
     468              : 
     469            0 :     descriptionDownloader = new QBatchWebDownload(10000, 5, this);
     470            0 :     connect(descriptionDownloader, &QBatchWebDownload::finished,
     471            0 :             this, &NewsSitemapSynthesizer::onDescriptionsReady);
     472            0 :     descriptionDownloader->get(urls);
     473            0 : }
     474              : 
     475            0 : void NewsSitemapSynthesizer::onDescriptionsReady()
     476              : {
     477            0 :     auto results = descriptionDownloader->results();
     478            0 :     descriptionDownloader->deleteLater();
     479            0 :     descriptionDownloader = nullptr;
     480              : 
     481            0 :     for (auto it = results.constBegin(); it != results.constEnd(); ++it) {
     482            0 :         if (!it.value().success) {
     483            0 :             continue;
     484              :         }
     485              : 
     486            0 :         QString xhtml = WebPageGrabber::htmlToXhtml(it.value().data);
     487            0 :         if (xhtml.isEmpty()) {
     488            0 :             continue;
     489              :         }
     490              : 
     491            0 :         PageMetadata meta = PageMetadataExtractor::extract(xhtml);
     492            0 :         if (!meta.description.isEmpty()) {
     493            0 :             fetchedDescriptions[it.key()] = meta.description;
     494              :         }
     495            0 :     }
     496              : 
     497            0 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: fetched" << fetchedDescriptions.size()
     498            0 :                         << "descriptions from" << results.size() << "articles";
     499              : 
     500            0 :     state = IDLE;
     501            0 :     buildRawFeed();
     502            0 :     emit done();
     503            0 : }
     504              : 
     505            0 : void NewsSitemapSynthesizer::buildRawFeed()
     506              : {
     507            0 :     _result = std::make_shared<RawFeed>();
     508            0 :     _result->feedType = RawFeed::GoogleNewsSitemap;
     509            0 :     _result->title = feedTitle;
     510            0 :     _result->subtitle = "";
     511            0 :     _result->url = feedSourceUrl;
     512            0 :     _result->siteURL = QUrl(siteBaseUrl.scheme() + "://" + siteBaseUrl.host());
     513              : 
     514            0 :     for (const SitemapEntry& entry : feedEntries) {
     515            0 :         auto item = std::make_shared<RawNews>();
     516            0 :         item->guid = entry.url.toString();
     517            0 :         item->title = entry.newsTitle;
     518            0 :         item->author = entry.publicationName;
     519            0 :         item->url = entry.url;
     520            0 :         item->timestamp = entry.publicationDate.isValid()
     521            0 :             ? entry.publicationDate
     522            0 :             : (entry.lastmod.isValid() ? entry.lastmod : QDateTime::currentDateTime());
     523              : 
     524            0 :         if (fetchedDescriptions.contains(entry.url)) {
     525            0 :             item->description = fetchedDescriptions.value(entry.url);
     526              :         }
     527              : 
     528              :         // Embed the sitemap thumbnail image in the content so it flows
     529              :         // through the normal HTML sanitizer and image pipeline.
     530            0 :         if (entry.imageUrl.isValid()) {
     531            0 :             item->content = "<img src=\"" + entry.imageUrl.toString() + "\"/>";
     532              :         }
     533              : 
     534            0 :         _result->items.append(item);
     535            0 :     }
     536              : 
     537            0 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer: built feed with"
     538            0 :                         << _result->items.size() << "items";
     539            0 : }
     540              : 
     541           12 : void NewsSitemapSynthesizer::setResultState(std::shared_ptr<RawFeed> result, bool hasError,
     542              :                                                    const QString& errorString)
     543              : {
     544           12 :     _result = result;
     545           12 :     _hasError = hasError;
     546           12 :     _errorString = errorString;
     547           12 : }
     548              : 
     549            4 : QList<SitemapEntry> NewsSitemapSynthesizer::deduplicateRepetitiveTitles(
     550              :     const QList<SitemapEntry>& entries, int prefixWordCount, int repetitionThreshold)
     551              : {
     552              :     // Group entries by their first N words (lowercased).
     553            4 :     QMap<QString, QList<int>> prefixGroups;
     554           37 :     for (int i = 0; i < entries.size(); ++i) {
     555           33 :         QStringList words = entries[i].newsTitle.toLower().split(' ', Qt::SkipEmptyParts);
     556           33 :         QString key;
     557           33 :         if (words.size() >= prefixWordCount) {
     558           33 :             key = QStringList(words.mid(0, prefixWordCount)).join(' ');
     559              :         } else {
     560            0 :             key = words.join(' ');
     561              :         }
     562           33 :         prefixGroups[key].append(i);
     563           33 :     }
     564              : 
     565              :     // Build a set of indices to exclude (all but the most recent in large groups).
     566            4 :     QSet<int> excluded;
     567           22 :     for (auto it = prefixGroups.cbegin(); it != prefixGroups.cend(); ++it) {
     568           18 :         const QList<int>& indices = it.value();
     569           18 :         if (indices.size() <= repetitionThreshold) {
     570           16 :             continue;
     571              :         }
     572              : 
     573              :         // Find the most recent entry in this group.
     574            2 :         int bestIdx = indices.first();
     575            2 :         QDateTime bestDate;
     576           17 :         for (int idx : indices) {
     577           15 :             QDateTime date = entries[idx].publicationDate.isValid()
     578           15 :                 ? entries[idx].publicationDate : entries[idx].lastmod;
     579           15 :             if (!bestDate.isValid() || (date.isValid() && date > bestDate)) {
     580            3 :                 bestDate = date;
     581            3 :                 bestIdx = idx;
     582              :             }
     583           15 :         }
     584              : 
     585              :         // Exclude all but the best.
     586           17 :         for (int idx : indices) {
     587           15 :             if (idx != bestIdx) {
     588           13 :                 excluded.insert(idx);
     589              :             }
     590              :         }
     591            2 :     }
     592              : 
     593              :     // Build filtered list preserving original order.
     594            4 :     QList<SitemapEntry> result;
     595           37 :     for (int i = 0; i < entries.size(); ++i) {
     596           33 :         if (!excluded.contains(i)) {
     597           20 :             result.append(entries[i]);
     598              :         }
     599              :     }
     600            8 :     return result;
     601            4 : }
     602              : 
     603            3 : void NewsSitemapSynthesizer::reportError(const QString& error)
     604              : {
     605            3 :     state = IDLE;
     606            3 :     _hasError = true;
     607            3 :     _errorString = error;
     608            6 :     qCDebug(logFeedDiscovery) << "NewsSitemapSynthesizer error:" << error;
     609            3 :     emit done();
     610            3 : }
        

Generated by: LCOV version 2.0-1