Line data Source code
1 : #include "ParserXMLWorker.h"
2 : #include <QtCore/qtimezone.h>
3 : #include "../utilities/ErrorHandling.h"
4 :
5 86 : ParserXMLWorker::ParserXMLWorker(QObject *parent) :
6 86 : FangObject(parent), feed(nullptr), currentItem(nullptr), isValid(false), inAtomXHTML(false)
7 : {
8 86 : }
9 :
10 172 : ParserXMLWorker::~ParserXMLWorker()
11 : {
12 86 : delete feed;
13 172 : }
14 :
15 60 : void ParserXMLWorker::documentStart()
16 : {
17 : // Make a new feed! Yay!
18 60 : delete feed;
19 60 : feed = new RawFeed();
20 60 : isValid = true;
21 :
22 60 : resetParserVars();
23 60 : }
24 :
25 59 : void ParserXMLWorker::documentEnd()
26 : {
27 59 : if (isValid) {
28 59 : if (feed->items.size() == 0) {
29 : // Edge case: we typically save the summary when we encounter the first item. This
30 : // handles the case where they were no items but we might have a summary.
31 1 : saveSummary();
32 : }
33 59 : emit done(feed);
34 : }
35 :
36 : // If it's not valid, we already emitted a signal.
37 59 : }
38 :
39 59 : void ParserXMLWorker::addXML(QByteArray data)
40 : {
41 59 : if (!isValid) {
42 0 : return;
43 : }
44 :
45 59 : xml.addData(data);
46 :
47 88714 : while (!xml.atEnd()) {
48 : // Grab the next thingie.
49 88655 : xml.readNext();
50 :
51 88655 : if (xml.isStartElement()) {
52 23936 : elementStart();
53 64719 : } else if (xml.isEndElement()) {
54 23936 : elementEnd();
55 40783 : } else if (xml.isCharacters() && !xml.isWhitespace()) {
56 17811 : elementContents();
57 : }
58 : }
59 :
60 : // Standards need to be a bit lax for RSS.
61 59 : if (xml.error() && xml.error() != QXmlStreamReader::PrematureEndOfDocumentError &&
62 0 : xml.error() != QXmlStreamReader::NotWellFormedError) {
63 0 : isValid = false;
64 0 : qWarning() << "XML ERROR:" << xml.lineNumber() << ": " << xml.errorString();
65 0 : emit done(nullptr);
66 : }
67 :
68 : }
69 :
70 :
71 23936 : void ParserXMLWorker::elementStart()
72 : {
73 23936 : QString tagName = xml.name().toString().toLower();
74 :
75 : // Look for start of entries.
76 : //qDebug() << "XML node: " << xml.name().toString() << " " << xml.prefix().toString();
77 23936 : if ((tagName == "item" || tagName == "entry") && !inAtomXHTML) {
78 :
79 1922 : if (urlHref.isEmpty()) {
80 3776 : urlHref = xml.attributes().value("rss:about").toString();
81 : }
82 :
83 1922 : if (numItems == 0) {
84 : // Oh, first item? Assume we've seen the summary then.
85 58 : saveSummary();
86 : }
87 :
88 1922 : currentItem = new RawNews(feed);
89 1922 : numItems++;
90 22756 : } else if ((tagName == "content" || tagName == "summary") &&
91 23498 : xml.attributes().value("type").toString().toLower() == "xhtml") {
92 : // Atom has a crappy feature where you can just stick unescaped xhtml
93 : // into the Atom's DOM. Someone at Google must not believe in SAX
94 : // parsers, I guess?
95 99 : inAtomXHTML = true;
96 21915 : } else if (inAtomXHTML) {
97 : // Build a string of the tag's elements.
98 2802 : QString elements = "";
99 2802 : QXmlStreamAttributes attributes = xml.attributes();
100 4204 : for (QXmlStreamAttribute attribute : attributes) {
101 2804 : elements += " " + attribute.name().toString() + "=\""
102 4206 : + attribute.value().toString() + "\"";
103 1402 : }
104 :
105 : // Mash the tag together.
106 2802 : content += "<" + xml.qualifiedName().toString() + elements + ">";
107 :
108 : // Early exit!
109 2802 : return;
110 2802 : }
111 :
112 21134 : currentTag = tagName;
113 21134 : currentPrefix = xml.prefix().toString().toLower();
114 42268 : hasType = xml.attributes().hasAttribute("type");
115 :
116 23186 : if (currentTag == "link" && urlHref.isEmpty() && xml.attributes().hasAttribute("href")) {
117 : // Used by atom feeds to grab the first link.
118 416 : urlHref = xml.attributes().value("href").toString();
119 : }
120 :
121 : // Add this new tag to our stack. :)
122 21134 : tagStack.push(tagName);
123 23936 : }
124 :
125 23936 : void ParserXMLWorker::elementEnd()
126 : {
127 23936 : if (!inAtomXHTML) {
128 21035 : tagStack.pop(); // Pop our tag stack, we're through with this one!
129 : }
130 :
131 23936 : QString tagName = xml.name().toString().toLower();
132 :
133 23936 : if ((tagName == "item" || tagName == "entry") && !inAtomXHTML) {
134 : //qDebug() << "End element:" << xml.name().toString();
135 1922 : if (currentItem == nullptr) {
136 : // Throw some kinda error, this can't happen.
137 0 : qDebug() << "Current item is null!";
138 0 : qDebug() << "Current title: " << title;
139 0 : qDebug() << "Xml element: " << tagName;
140 : }
141 :
142 : // Figure out which date to use.
143 1922 : QString timestamp;
144 1922 : if (!pubdate.trimmed().isEmpty()) {
145 1679 : timestamp = pubdate;
146 243 : } else if (!lastbuilddate.trimmed().isEmpty()) {
147 0 : timestamp = lastbuilddate;
148 243 : } else if (!created.trimmed().isEmpty()) {
149 15 : timestamp = created;
150 228 : } else if (!date.trimmed().isEmpty()) {
151 69 : timestamp = date;
152 159 : } else if (!updated.trimmed().isEmpty()) {
153 159 : timestamp = updated;
154 : }
155 :
156 : // Determine the GUID.
157 1922 : QString myGuid;
158 1922 : if (!id.trimmed().isEmpty()) {
159 174 : myGuid = id.trimmed();
160 1748 : } else if (!guid.trimmed().isEmpty()) {
161 1314 : myGuid = guid.trimmed();
162 434 : } else if (!urlData.trimmed().isEmpty()) {
163 434 : myGuid = urlData.trimmed();
164 : } else {
165 0 : myGuid = urlHref.trimmed();
166 : }
167 :
168 : // Skip items without a GUID - malformed feed
169 1922 : if (myGuid.isEmpty()) {
170 0 : qWarning() << "ParserXMLWorker: RSS/Atom item missing GUID/URL, skipping item";
171 0 : qWarning() << " Title:" << title;
172 0 : delete currentItem;
173 0 : currentItem = nullptr;
174 :
175 : // Clear all strings for next item
176 0 : author = title = subtitle = content = QString();
177 0 : urlData = urlHref = guid = id = date = updated = timestamp = QString();
178 0 : return;
179 : }
180 :
181 : // Item space.
182 1922 : currentItem->author = author;
183 1922 : currentItem->title = title;
184 1922 : currentItem->description = subtitle;
185 1922 : currentItem->content = content;
186 1922 : currentItem->url = urlData.isEmpty() ? QUrl(urlHref) : QUrl(urlData);
187 1922 : currentItem->timestamp = dateFromFeedString(timestamp);
188 1922 : currentItem->guid = myGuid;
189 :
190 : // Okay, give it up. :(
191 1922 : if (!currentItem->timestamp.isValid()) {
192 0 : qDebug() << "Time string: " << timestamp;
193 0 : qDebug() << "invalid date!";
194 : }
195 :
196 :
197 1922 : feed->items.append(currentItem);
198 1922 : currentItem = nullptr;
199 :
200 : // Clear all strings.
201 1922 : title = "";
202 1922 : urlHref = "";
203 1922 : urlData = "";
204 1922 : subtitle = "";
205 1922 : pubdate = "";
206 1922 : lastbuilddate = "";
207 1922 : created = "";
208 1922 : date = "";
209 1922 : updated = "";
210 1922 : author = "";
211 1922 : content = "";
212 1922 : guid = "";
213 1922 : id = "";
214 23936 : } else if (tagName == "content" || tagName == "summary") {
215 : // Just accept that this is the end of one of these:
216 : // <contents type="xhtml">
217 742 : if (inAtomXHTML) {
218 99 : inAtomXHTML = false;
219 99 : tagStack.pop(); // We didn't do this earlier, you see.
220 : }
221 : }
222 :
223 23936 : if (inAtomXHTML) {
224 : // SLORG we need to add this tag to the contents.
225 :
226 : // TODO: Is there a better way to do this?!
227 2802 : content += "</" + xml.qualifiedName().toString() + ">";
228 : }
229 23936 : }
230 :
231 17811 : void ParserXMLWorker::elementContents()
232 : {
233 17811 : if (inAtomXHTML) {
234 : // Atom sucks!
235 1860 : content += xml.text().toString();
236 :
237 1860 : return; // Early exit.
238 : }
239 :
240 15951 : QString parentTag = getTagStackAt(1);
241 15951 : if (parentTag == "item" || parentTag == "entry") {
242 : //
243 : // Inside a news item.
244 : //
245 :
246 14613 : if (currentTag == "title" && currentPrefix == "") {
247 1922 : title += xml.text().toString();
248 12691 : } else if (currentTag == "link" && currentPrefix == "") {
249 1748 : urlData += xml.text().toString();
250 10943 : } else if (currentTag == "description" || currentTag == "summary") {
251 1939 : subtitle += xml.text().toString();
252 9004 : } else if (currentTag == "name") {
253 0 : author += xml.text().toString();
254 9004 : } else if (currentTag == "pubdate") {
255 1679 : pubdate += xml.text().toString();
256 7325 : } else if (currentTag == "lastbuilddate") {
257 0 : lastbuilddate += xml.text().toString();
258 7325 : } else if (currentTag == "created") {
259 15 : created += xml.text().toString();
260 7310 : } else if (currentTag == "updated") {
261 184 : updated += xml.text().toString();
262 7126 : } else if (currentTag == "date") {
263 154 : date += xml.text().toString();
264 6972 : } else if (currentTag == "guid") {
265 1314 : guid += xml.text().toString();
266 5658 : } else if (currentTag == "id") {
267 174 : id += xml.text().toString();
268 5689 : } else if ((currentTag == "encoded" && currentPrefix == "content")
269 5689 : || (currentTag == "content" && hasType)) {
270 260 : content += xml.text().toString();
271 : }
272 1338 : } else if (parentTag == "channel" || parentTag == "feed") {
273 : //
274 : // Top level items.
275 : //
276 :
277 401 : if (currentTag == "title" && currentPrefix == "") {
278 59 : title += xml.text().toString();
279 342 : } else if (currentTag == "link" && currentPrefix == "") {
280 52 : urlData += xml.text().toString();
281 290 : } else if (currentTag == "description" || currentTag == "summary") {
282 45 : subtitle += xml.text().toString();
283 : }
284 : }
285 15951 : }
286 :
287 60 : void ParserXMLWorker::resetParserVars()
288 : {
289 60 : xml.clear();
290 :
291 60 : numItems = 0;
292 60 : currentTag = "";
293 60 : currentPrefix = "";
294 60 : urlHref = "";
295 60 : title = "";
296 60 : subtitle = "";
297 60 : content = "";
298 60 : pubdate = "";
299 60 : lastbuilddate = "";
300 60 : created = "";
301 60 : updated = "";
302 60 : date = "";
303 60 : author = "";
304 60 : guid = "";
305 60 : id = "";
306 60 : hasType = false;
307 60 : inAtomXHTML = false;
308 60 : tagStack.clear();
309 60 : }
310 :
311 59 : void ParserXMLWorker::saveSummary()
312 : {
313 : // Global space.
314 59 : feed->title = title;
315 59 : feed->subtitle = subtitle;
316 59 : feed->siteURL = urlData.isEmpty() ? QUrl(urlHref) : QUrl(urlData);
317 :
318 : // Clear all local strings.
319 59 : title = "";
320 59 : urlHref = "";
321 59 : urlData = "";
322 59 : subtitle = "";
323 59 : pubdate = "";
324 59 : lastbuilddate = "";
325 59 : updated = "";
326 59 : date = "";
327 59 : author = "";
328 59 : content = "";
329 59 : guid = "";
330 59 : id = "";
331 59 : }
332 :
333 :
334 1922 : QDateTime ParserXMLWorker::dateFromFeedString(const QString& _timestamp)
335 : {
336 1922 : QDateTime ret; // Defaults to invalid timestamp.
337 :
338 : // Come up with a few versions of the time stamp.
339 1922 : QString timestamp = _timestamp.trimmed();
340 1922 : yearFix(timestamp); //IMPORTANT: Must be done *before* weekday name is shaved.
341 1922 : shaveWeekdayName(timestamp);
342 1922 : monthMassager(timestamp);
343 : QString timestamps[] = {
344 : timestamp,
345 3844 : timestamp.left(timestamp.lastIndexOf(" ")).trimmed(),
346 3844 : timestamp.left(timestamp.lastIndexOf(".")).trimmed(),
347 3844 : timestamp.left(timestamp.lastIndexOf("-")).trimmed(),
348 3844 : timestamp.left(timestamp.lastIndexOf("+")).trimmed(),
349 :
350 : "" // must be last
351 23064 : };
352 :
353 : // Date time. Comes in many (ugh) different formats.
354 : const QString dateFormats[] = {
355 : // Most typical RSS format
356 : // Example: Tue, 02 Jul 2013 01:01:24 +0000 or Sun, 13 Oct 2013 19:15:29 PST
357 : // But Fang shaves off weekday names (see above), because they're useless and are often screwed up.
358 : "dd MMM yyyy hh:mm:ss",
359 :
360 : // One-digit minutes (yes, this happens.)
361 : "dd MMM yyyy hh:m:ss",
362 :
363 : // Same as above, but with full months.
364 : "dd MMMM yyyy hh:mm:ss",
365 :
366 : // Full month, one digit minutes.
367 : "dd MMMM yyyy hh:m:ss",
368 :
369 : // Also same as above, but with potentially single-digit days. (Used by "The Hindu".)
370 : "d MMM yyyy hh:mm:ss",
371 :
372 : // RFC 3339, normally used by Atom.
373 : // Example: 2013-08-07T16:47:54Z
374 : "yyyy-MM-ddThh:mm:ssZ",
375 :
376 : // Variant of the above without the trailing Z.
377 : // Example: 2012-05-30T19:46:42
378 : "yyyy-MM-ddThh:mm:ss",
379 :
380 : // Variant of the above without seconds OR a trailing Z.
381 : // Example: 2012-05-30T19:46
382 : "yyyy-MM-ddThh:mm",
383 :
384 : // Format used by some Chinese site.
385 : // Example: 2014-02-27 08:26:16.995
386 : "yyyy-MM-dd hh:mm:ss",
387 :
388 : // "Lokmat" uses this custom format. I provide a single-spaced version for sanity's sake.
389 : // Example: 25-02-2014 01:08:10
390 : "dd-MM-yyyy hh:mm:ss",
391 : "dd-MM-yyyy hh:mm:ss",
392 :
393 :
394 : "" // must be last!
395 26908 : };
396 :
397 : // Iterate over date formats.
398 1922 : int i = 0;
399 7019 : while (!ret.isValid() && !dateFormats[i].isEmpty()) {
400 5097 : const QString& format = dateFormats[i];
401 :
402 : // Try each format against each possible manipulated timestamp.
403 5097 : int j = 0;
404 24946 : while (!ret.isValid() && !timestamps[j].isEmpty()) {
405 19849 : QString& ts = timestamps[j];
406 19849 : ret = QDateTime::fromString(ts, format);
407 :
408 19849 : j++;
409 : }
410 :
411 5097 : i++;
412 : }
413 :
414 : // Check if there's a time-based adjustment and/or timezone.
415 : // For now we only look for time identifiers in the format of -hhmm or +hhmm
416 : //
417 : // TODO: Three-letter time zones. (TLAs like GMT, PST, etc.)
418 : //
419 1922 : int lastPlus = timestamp.lastIndexOf("+");
420 1922 : int lastMinus = timestamp.lastIndexOf("-");
421 1922 : if (lastPlus > 3 || lastMinus > 3) {
422 : // We have a plus or a minus.
423 1427 : int signPos = lastPlus > 3 ? lastPlus : lastMinus;
424 1427 : QString sAdjustment = timestamp.right(timestamp.length() - signPos);
425 1427 : sAdjustment = sAdjustment.trimmed();
426 :
427 : // Check for an hour/minute adjustment, in the format of -hhmm or +hhmm
428 : // OR in the format of -hh:mm or +hh:mm
429 2627 : if ((sAdjustment.length() == 5 || sAdjustment.length() == 6) &&
430 2627 : (sAdjustment.startsWith("+") || sAdjustment.startsWith("-"))) {
431 1200 : int adjustment = 0; // Adjustment in minutes.
432 1200 : bool containsCol = sAdjustment.contains(':');
433 1200 : bool isNum = false;
434 1200 : int hours = 0;
435 1200 : int minutes = 0;
436 :
437 1200 : QString sNumber = sAdjustment.right(containsCol ? 5 : 4); // Skip + or -
438 : // YES! We've got an adjustment!
439 1200 : hours = sNumber.left(2).toInt(&isNum);
440 1200 : if (isNum)
441 1200 : minutes = sNumber.right(2).toInt(&isNum);
442 :
443 : // Looks like we're good!
444 1200 : if (isNum) {
445 : // Condense down to minutes.
446 1200 : minutes += (hours * 60);
447 1200 : adjustment = sAdjustment.startsWith("-") ? minutes : -minutes;
448 :
449 : // Add in our adjustment if we need it.
450 1200 : ret = ret.addSecs(adjustment * 60 /* seconds */);
451 : }
452 1200 : }
453 1427 : }
454 :
455 : // All times are (supposedly) in UTC.
456 1922 : ret.setTimeZone(QTimeZone::UTC);
457 :
458 3844 : return ret;
459 40362 : }
460 :
461 :
462 1922 : void ParserXMLWorker::yearFix(QString& timestamp)
463 : {
464 : // If the timestamp is something like this:
465 : // Tue, 02 Jul 13 [etc]
466 : // We want to make it something like this:
467 : // Tue, 02 Jul 2013 [etc]
468 1922 : if (timestamp.length() == 0 || !timestamp[0].isLetter()) {
469 331 : return; // Early exit.
470 : }
471 :
472 1591 : bool seenWeekday = false;
473 1591 : bool seenDay = false;
474 1591 : bool seenMonth = false;
475 1591 : bool seenYear = false;
476 1591 : bool hitSpace = true; // This controls whether or not we examine the character.
477 1591 : int charsInYear = 0;
478 27047 : for (int i = 0; i < timestamp.length(); i++) {
479 27047 : if (hitSpace && (timestamp[i].isLetter() || timestamp[i] == ',')) {
480 3182 : hitSpace = false; // reset
481 :
482 3182 : if (!seenWeekday) {
483 1591 : seenWeekday = true;
484 1591 : } else if (!seenMonth) {
485 1591 : seenMonth = true;
486 : }
487 23865 : } else if (hitSpace && (timestamp[i].isDigit())) {
488 3182 : hitSpace = false; // reset
489 :
490 3182 : if (!seenDay) {
491 1591 : seenDay = true;
492 1591 : } else if (!seenYear) {
493 1591 : seenYear = true;
494 : }
495 20683 : } else if (timestamp[i].isSpace()) {
496 6364 : if (seenYear) {
497 : // Here's where we find out if we can leave yet.
498 1591 : if (charsInYear != 2) {
499 1575 : break; // Early exit!
500 : } else {
501 : // Sigh... okay, now we have to back up and insert a "20".
502 : // Currently we're here: [Tue, 02 Jul 13 ]
503 16 : timestamp = timestamp.insert(i - 2, "20");
504 16 : return; // YAY! WE DID IT!
505 : }
506 : } else {
507 4773 : hitSpace = true;
508 : }
509 : }
510 :
511 25456 : if (seenYear) {
512 6332 : ++charsInYear;
513 : }
514 : }
515 : }
516 :
517 :
518 1922 : void ParserXMLWorker::shaveWeekdayName(QString& timestamp)
519 : {
520 : // NOTE:
521 : // By the time we've reached this method, the timestamp has
522 : // already been trimmed, and we've made sure the year has four digits.
523 :
524 1922 : int comma = timestamp.indexOf(',');
525 1922 : if (comma < 0) {
526 331 : return; // Early exit.
527 : }
528 :
529 : // Remove up to and including the comma itself.
530 1591 : timestamp = timestamp.remove(0, comma + 1).trimmed();
531 : }
532 :
533 1922 : void ParserXMLWorker::monthMassager(QString& timestamp)
534 : {
535 : // Add new ones as they're encountered.
536 1922 : timestamp = timestamp.replace("Sept ", "Sep ");
537 1922 : }
538 :
539 :
540 15951 : QString ParserXMLWorker::getTagStackAt(qint32 n)
541 : {
542 : // n is from 0..size - 1
543 15951 : if (tagStack.isEmpty() || (tagStack.size() - 1) < n)
544 0 : return "";
545 :
546 15951 : return tagStack.at(tagStack.size() - 1 - n);
547 : }
|