Line data Source code
1 : #include "RSSAtomParser.h"
2 : #include "FeedDateParser.h"
3 : #include "FeedParserLogging.h"
4 :
5 : // Returns true if the string contains only whitespace (or is empty).
6 7912 : static bool isBlankOrEmpty(const QString& s)
7 : {
8 7912 : return QStringView(s).trimmed().isEmpty();
9 : }
10 :
11 : // Some feeds (e.g. excelsior.com.mx) double-escape CDATA markers, producing
12 : // literal "<![CDATA[...]]>" text instead of actual CDATA sections. Strip them.
13 6705 : static QString stripEscapedCDATA(const QString& text)
14 : {
15 6705 : QStringView view = QStringView(text).trimmed();
16 6706 : if (view.startsWith(u"<![CDATA[") && view.endsWith(u"]]>")) {
17 1 : return view.mid(9, view.size() - 12).trimmed().toString();
18 : }
19 6704 : return text;
20 : }
21 :
22 96 : std::unique_ptr<RawFeed> RSSAtomParser::parse(const QByteArray& data)
23 : {
24 96 : RSSAtomParser worker;
25 96 : worker.feed = std::make_unique<RawFeed>();
26 96 : worker.isValid = true;
27 :
28 96 : worker.xml.addData(data);
29 101720 : while (!worker.xml.atEnd()) {
30 101624 : worker.xml.readNext();
31 101624 : if (worker.xml.isStartElement()) {
32 27290 : worker.elementStart();
33 74334 : } else if (worker.xml.isEndElement()) {
34 27287 : worker.elementEnd();
35 47047 : } else if (worker.xml.isCharacters() && !worker.xml.isWhitespace()) {
36 20387 : worker.elementContents();
37 : }
38 : }
39 :
40 96 : if (worker.xml.error()
41 3 : && worker.xml.error() != QXmlStreamReader::PrematureEndOfDocumentError
42 99 : && worker.xml.error() != QXmlStreamReader::NotWellFormedError) {
43 0 : worker.isValid = false;
44 0 : qCWarning(logFeedParser) << "XML ERROR:" << worker.xml.lineNumber()
45 0 : << ": " << worker.xml.errorString();
46 : }
47 :
48 96 : if (!worker.isValid) {
49 0 : return nullptr;
50 : }
51 :
52 96 : if (worker.feed->items.size() == 0) {
53 4 : worker.saveSummary();
54 : }
55 :
56 : // No items and no feed title means we never found RSS/Atom content.
57 96 : if (worker.feed->items.isEmpty() && worker.feed->title.isEmpty()) {
58 2 : return nullptr;
59 : }
60 :
61 94 : return std::move(worker.feed);
62 96 : }
63 :
64 27290 : void RSSAtomParser::elementStart()
65 : {
66 27290 : QString tagName = xml.name().toString().toLower();
67 :
68 27290 : if ((tagName == "item" || tagName == "entry") && !state.inAtomXHTML) {
69 :
70 2238 : if (state.urlHref.isEmpty()) {
71 4390 : state.urlHref = xml.attributes().value("rss:about").toString();
72 : }
73 :
74 2238 : if (state.numItems == 0) {
75 93 : saveSummary();
76 : }
77 :
78 2238 : currentItem = std::make_shared<RawNews>();
79 2238 : state.numItems++;
80 25874 : } else if ((tagName == "content" || tagName == "summary") &&
81 26696 : xml.attributes().value("type").toString().toLower() == "xhtml") {
82 99 : state.inAtomXHTML = true;
83 99 : state.content.reserve(4096);
84 24953 : } else if (state.inAtomXHTML) {
85 : // Rebuild the XHTML tag directly into content (no temporaries).
86 2802 : state.content.append('<');
87 2802 : state.content.append(xml.qualifiedName());
88 4204 : for (const auto& attribute : xml.attributes()) {
89 1402 : state.content.append(' ');
90 1402 : state.content.append(attribute.name());
91 2804 : state.content.append(QStringLiteral("=\""));
92 1402 : state.content.append(attribute.value());
93 1402 : state.content.append('"');
94 2802 : }
95 2802 : state.content.append('>');
96 :
97 2802 : return;
98 : }
99 :
100 24488 : state.currentTag = tagName;
101 24488 : state.currentPrefix = xml.prefix().toString().toLower();
102 48976 : state.hasType = xml.attributes().hasAttribute("type");
103 :
104 : // Podcast detection: only flag itunes elements that are specific to actual
105 : // podcast feeds. Many non-podcast feeds (e.g. Substack blogs) include generic
106 : // itunes metadata like itunes:owner, itunes:author, and itunes:block.
107 24488 : if (state.currentPrefix == "itunes") {
108 1441 : if (state.currentTag == "duration" || state.currentTag == "episode"
109 655 : || state.currentTag == "episodetype" || state.currentTag == "season"
110 1441 : || state.currentTag == "explicit" || state.currentTag == "category") {
111 264 : state.hasPodcastSignals = true;
112 : }
113 : }
114 :
115 : // Podcast detection: audio enclosures.
116 24488 : if (state.currentTag == "enclosure") {
117 796 : QString type = xml.attributes().value("type").toString().toLower();
118 398 : if (type.startsWith("audio/")) {
119 254 : state.hasPodcastSignals = true;
120 : }
121 398 : }
122 :
123 : // Media RSS image extraction (media:thumbnail and media:content).
124 24488 : if (currentItem && state.currentPrefix == "media") {
125 1444 : if (state.currentTag == "thumbnail") {
126 1206 : QString url = xml.attributes().value("url").toString();
127 1206 : int width = xml.attributes().value("width").toString().toInt();
128 603 : if (!url.isEmpty() && (state.mediaImageURL.isEmpty() || width > state.mediaImageWidth)) {
129 603 : state.mediaImageURL = url;
130 603 : state.mediaImageWidth = width;
131 : }
132 1444 : } else if (state.currentTag == "content") {
133 834 : QString type = xml.attributes().value("type").toString().toLower();
134 417 : if (type.startsWith("image/")) {
135 470 : QString url = xml.attributes().value("url").toString();
136 470 : int width = xml.attributes().value("width").toString().toInt();
137 235 : if (!url.isEmpty() && (state.mediaImageURL.isEmpty() || width > state.mediaImageWidth)) {
138 173 : state.mediaImageURL = url;
139 173 : state.mediaImageWidth = width;
140 : }
141 235 : }
142 417 : }
143 : }
144 :
145 26896 : if (state.currentTag == "link" && state.urlHref.isEmpty() && xml.attributes().hasAttribute("href")) {
146 444 : state.urlHref = xml.attributes().value("href").toString();
147 : }
148 :
149 24488 : state.tagStack.push(tagName);
150 27290 : }
151 :
152 27287 : void RSSAtomParser::elementEnd()
153 : {
154 27287 : if (!state.inAtomXHTML) {
155 24386 : state.tagStack.pop();
156 : }
157 :
158 27287 : QString tagName = xml.name().toString().toLower();
159 :
160 27287 : if ((tagName == "item" || tagName == "entry") && !state.inAtomXHTML) {
161 2237 : if (!currentItem) {
162 2 : qCWarning(logFeedParser) << "Current item is null!";
163 2 : qCWarning(logFeedParser) << "Current title: " << state.title;
164 2 : qCWarning(logFeedParser) << "Xml element: " << tagName;
165 2 : return;
166 : }
167 :
168 : // Figure out which date to use.
169 2236 : QString timestamp;
170 2236 : if (!isBlankOrEmpty(state.pubdate)) {
171 1986 : timestamp = state.pubdate;
172 250 : } else if (!isBlankOrEmpty(state.lastbuilddate)) {
173 1 : timestamp = state.lastbuilddate;
174 249 : } else if (!isBlankOrEmpty(state.created)) {
175 15 : timestamp = state.created;
176 234 : } else if (!isBlankOrEmpty(state.date)) {
177 69 : timestamp = state.date;
178 165 : } else if (!isBlankOrEmpty(state.updated)) {
179 164 : timestamp = state.updated;
180 : }
181 :
182 : // Determine the GUID.
183 2236 : QString myGuid;
184 2236 : if (!isBlankOrEmpty(state.id)) {
185 179 : myGuid = state.id.trimmed();
186 2057 : } else if (!isBlankOrEmpty(state.guid)) {
187 1572 : myGuid = state.guid.trimmed();
188 485 : } else if (!isBlankOrEmpty(state.urlData)) {
189 484 : myGuid = state.urlData.trimmed();
190 : } else {
191 1 : myGuid = state.urlHref.trimmed();
192 : }
193 :
194 : // Skip items without a GUID - malformed feed
195 2236 : if (myGuid.isEmpty()) {
196 2 : qCWarning(logFeedParser) << "RSSAtomParser: RSS/Atom item missing GUID/URL, skipping item";
197 2 : qCWarning(logFeedParser) << " Title:" << state.title;
198 1 : currentItem.reset();
199 1 : state.clearItemFields();
200 1 : return;
201 : }
202 :
203 : // Item space.
204 2235 : currentItem->author = state.author;
205 2235 : currentItem->title = stripEscapedCDATA(state.title);
206 2235 : currentItem->description = stripEscapedCDATA(state.subtitle);
207 2235 : currentItem->content = stripEscapedCDATA(state.content);
208 :
209 2235 : currentItem->mediaImageURL = state.mediaImageURL;
210 :
211 2235 : currentItem->url = state.urlData.isEmpty() ? QUrl(state.urlHref) : QUrl(state.urlData);
212 2235 : currentItem->timestamp = FeedDateParser::dateFromFeedString(timestamp);
213 2235 : currentItem->guid = myGuid;
214 :
215 2235 : if (!currentItem->timestamp.isValid()) {
216 2 : qCDebug(logFeedParser) << "Time string: " << timestamp;
217 2 : qCDebug(logFeedParser) << "invalid date!";
218 : }
219 :
220 :
221 2235 : feed->items.append(currentItem);
222 2235 : feed->isPodcast = feed->isPodcast || state.hasPodcastSignals;
223 2235 : currentItem.reset();
224 :
225 2235 : state.clearItemFields();
226 27287 : } else if (tagName == "content" || tagName == "summary") {
227 822 : if (state.inAtomXHTML) {
228 99 : state.inAtomXHTML = false;
229 99 : state.tagStack.pop();
230 : }
231 : }
232 :
233 27285 : if (state.inAtomXHTML) {
234 5604 : state.content.append(QStringLiteral("</"));
235 2802 : state.content.append(xml.qualifiedName());
236 2802 : state.content.append('>');
237 : }
238 27287 : }
239 :
240 20387 : void RSSAtomParser::elementContents()
241 : {
242 20387 : if (state.inAtomXHTML) {
243 1860 : state.content += xml.text().toString();
244 1860 : return;
245 : }
246 :
247 18527 : QStringView parentTag = getTagStackAt(1);
248 20866 : if (parentTag == u"item" || parentTag == u"entry") {
249 : //
250 : // Inside a news item.
251 : //
252 :
253 16989 : if (state.currentTag == "title" && state.currentPrefix == "") {
254 2235 : state.title += xml.text().toString();
255 14754 : } else if (state.currentTag == "link" && state.currentPrefix == "") {
256 2057 : state.urlData += xml.text().toString();
257 23411 : } else if ((state.currentTag == "description" || state.currentTag == "summary")
258 23411 : && state.currentPrefix == "") {
259 2106 : state.subtitle += xml.text().toString();
260 10591 : } else if (state.currentTag == "name"
261 10591 : || (state.currentTag == "creator" && state.currentPrefix == "dc")) {
262 671 : state.author += xml.text().toString();
263 9920 : } else if (state.currentTag == "pubdate") {
264 1986 : state.pubdate += xml.text().toString();
265 7934 : } else if (state.currentTag == "lastbuilddate") {
266 1 : state.lastbuilddate += xml.text().toString();
267 7933 : } else if (state.currentTag == "created") {
268 15 : state.created += xml.text().toString();
269 7918 : } else if (state.currentTag == "updated") {
270 189 : state.updated += xml.text().toString();
271 7729 : } else if (state.currentTag == "date") {
272 154 : state.date += xml.text().toString();
273 7575 : } else if (state.currentTag == "guid") {
274 1572 : state.guid += xml.text().toString();
275 6003 : } else if (state.currentTag == "id") {
276 179 : state.id += xml.text().toString();
277 6089 : } else if ((state.currentTag == "encoded" && state.currentPrefix == "content")
278 6089 : || (state.currentTag == "content" && state.hasType)) {
279 320 : state.content += xml.text().toString();
280 : }
281 2563 : } else if (parentTag == u"channel" || parentTag == u"feed") {
282 : //
283 : // Top level items.
284 : //
285 :
286 547 : if (state.currentTag == "title" && state.currentPrefix == "") {
287 94 : state.title += xml.text().toString();
288 453 : } else if (state.currentTag == "link" && state.currentPrefix == "") {
289 85 : state.urlData += xml.text().toString();
290 679 : } else if ((state.currentTag == "description" || state.currentTag == "summary")
291 679 : && state.currentPrefix == "") {
292 57 : state.subtitle += xml.text().toString();
293 : }
294 : }
295 : }
296 :
297 2333 : void RSSAtomParser::ParseState::clearItemFields()
298 : {
299 2333 : title.clear();
300 2333 : subtitle.clear();
301 2333 : content.clear();
302 2333 : author.clear();
303 2333 : urlHref.clear();
304 2333 : urlData.clear();
305 2333 : pubdate.clear();
306 2333 : lastbuilddate.clear();
307 2333 : created.clear();
308 2333 : updated.clear();
309 2333 : date.clear();
310 2333 : guid.clear();
311 2333 : id.clear();
312 2333 : mediaImageURL.clear();
313 2333 : mediaImageWidth = 0;
314 2333 : }
315 :
316 97 : void RSSAtomParser::saveSummary()
317 : {
318 97 : feed->title = state.title;
319 97 : feed->subtitle = state.subtitle;
320 97 : feed->siteURL = state.urlData.isEmpty() ? QUrl(state.urlHref) : QUrl(state.urlData);
321 97 : feed->isPodcast = state.hasPodcastSignals;
322 :
323 97 : state.clearItemFields();
324 97 : }
325 :
326 :
327 18527 : QStringView RSSAtomParser::getTagStackAt(qint32 n)
328 : {
329 18527 : if (state.tagStack.isEmpty() || (state.tagStack.size() - 1) < n) {
330 1 : return QStringView();
331 : }
332 :
333 18526 : return state.tagStack.at(state.tagStack.size() - 1 - n);
334 : }
|