Line data Source code
1 : #include "FeedDateParser.h"
2 : #include <QMap>
3 : #include <QtCore/qtimezone.h>
4 :
5 2235 : QDateTime FeedDateParser::dateFromFeedString(const QString& _timestamp)
6 : {
7 2235 : QDateTime ret; // Defaults to invalid timestamp.
8 :
9 : // Come up with a few versions of the time stamp.
10 2235 : QString timestamp = _timestamp.trimmed();
11 2235 : yearFix(timestamp); // IMPORTANT: Must be done *before* weekday name is shaved.
12 2235 : shaveWeekdayName(timestamp);
13 2235 : monthMassager(timestamp);
14 : QString timestamps[] = {
15 : timestamp,
16 4470 : timestamp.left(timestamp.lastIndexOf(" ")).trimmed(),
17 4470 : timestamp.left(timestamp.lastIndexOf(".")).trimmed(),
18 4470 : timestamp.left(timestamp.lastIndexOf("-")).trimmed(),
19 4470 : timestamp.left(timestamp.lastIndexOf("+")).trimmed(),
20 :
21 : "" // must be last
22 26820 : };
23 :
24 : // Date time. Comes in many (ugh) different formats.
25 : const QString dateFormats[] = {
26 : // Most typical RSS format
27 : // Example: Tue, 02 Jul 2013 01:01:24 +0000 or Sun, 13 Oct 2013 19:15:29 PST
28 : // But Fang shaves off weekday names (see above), because they're useless and are often screwed up.
29 : "dd MMM yyyy hh:mm:ss",
30 :
31 : // One-digit minutes (yes, this happens.)
32 : "dd MMM yyyy hh:m:ss",
33 :
34 : // Same as above, but with full months.
35 : "dd MMMM yyyy hh:mm:ss",
36 :
37 : // Full month, one digit minutes.
38 : "dd MMMM yyyy hh:m:ss",
39 :
40 : // Also same as above, but with potentially single-digit days. (Used by "The Hindu".)
41 : "d MMM yyyy hh:mm:ss",
42 :
43 : // RFC 3339, normally used by Atom.
44 : // Example: 2013-08-07T16:47:54Z
45 : "yyyy-MM-ddThh:mm:ssZ",
46 :
47 : // Variant of the above without the trailing Z.
48 : // Example: 2012-05-30T19:46:42
49 : "yyyy-MM-ddThh:mm:ss",
50 :
51 : // Variant of the above without seconds OR a trailing Z.
52 : // Example: 2012-05-30T19:46
53 : "yyyy-MM-ddThh:mm",
54 :
55 : // Format used by some Chinese site.
56 : // Example: 2014-02-27 08:26:16.995
57 : "yyyy-MM-dd hh:mm:ss",
58 :
59 : // "Lokmat" uses this custom format. I provide a single-spaced version for sanity's sake.
60 : // Example: 25-02-2014 01:08:10
61 : "dd-MM-yyyy hh:mm:ss",
62 : "dd-MM-yyyy hh:mm:ss",
63 :
64 :
65 : "" // must be last!
66 31290 : };
67 :
68 : // Iterate over date formats.
69 2235 : int i = 0;
70 7680 : while (!ret.isValid() && !dateFormats[i].isEmpty()) {
71 5445 : const QString& format = dateFormats[i];
72 :
73 : // Try each format against each possible manipulated timestamp.
74 5445 : int j = 0;
75 26038 : while (!ret.isValid() && !timestamps[j].isEmpty()) {
76 20593 : QString& ts = timestamps[j];
77 20593 : ret = QDateTime::fromString(ts, format);
78 :
79 20593 : j++;
80 : }
81 :
82 5445 : i++;
83 : }
84 :
85 : // Check if there's a time-based adjustment and/or timezone.
86 : // First try numeric offsets in the format of -hhmm, +hhmm, -hh:mm, or +hh:mm.
87 2235 : int lastPlus = timestamp.lastIndexOf("+");
88 2235 : int lastMinus = timestamp.lastIndexOf("-");
89 2235 : if (lastPlus > 3 || lastMinus > 3) {
90 : // We have a plus or a minus.
91 1568 : int signPos = lastPlus > 3 ? lastPlus : lastMinus;
92 1568 : QString sAdjustment = timestamp.right(timestamp.length() - signPos);
93 1568 : sAdjustment = sAdjustment.trimmed();
94 :
95 : // Check for an hour/minute adjustment, in the format of -hhmm or +hhmm
96 : // OR in the format of -hh:mm or +hh:mm
97 2904 : if ((sAdjustment.length() == 5 || sAdjustment.length() == 6) &&
98 2904 : (sAdjustment.startsWith("+") || sAdjustment.startsWith("-"))) {
99 1336 : bool containsCol = sAdjustment.contains(':');
100 1336 : bool isNum = false;
101 1336 : int hours = 0;
102 1336 : int minutes = 0;
103 :
104 1336 : QString sNumber = sAdjustment.right(containsCol ? 5 : 4); // Skip + or -
105 1336 : hours = sNumber.left(2).toInt(&isNum);
106 1336 : if (isNum) {
107 1336 : minutes = sNumber.right(2).toInt(&isNum);
108 : }
109 :
110 1336 : if (isNum) {
111 : // Condense down to minutes.
112 1336 : minutes += (hours * 60);
113 1336 : int adjustment = sAdjustment.startsWith("-") ? minutes : -minutes;
114 :
115 : // Add in our adjustment if we need it.
116 1336 : ret = ret.addSecs(adjustment * 60 /* seconds */);
117 : }
118 1336 : }
119 1568 : }
120 :
121 : // Three-letter timezone abbreviations (UTC offset in minutes).
122 : static const QMap<QString, int> tzOffsets = {
123 0 : {"GMT", 0}, {"UTC", 0},
124 0 : {"EST", -300}, {"EDT", -240},
125 0 : {"CST", -360}, {"CDT", -300},
126 0 : {"MST", -420}, {"MDT", -360},
127 0 : {"PST", -480}, {"PDT", -420}
128 2246 : };
129 :
130 : // Check if the timestamp ends with a known abbreviation.
131 2235 : QString lastWord = timestamp.section(' ', -1).trimmed().toUpper();
132 2235 : if (tzOffsets.contains(lastWord)) {
133 666 : int offsetMinutes = tzOffsets.value(lastWord);
134 666 : ret = ret.addSecs(-offsetMinutes * 60);
135 : }
136 :
137 : // All times are (supposedly) in UTC.
138 2235 : ret.setTimeZone(QTimeZone::UTC);
139 :
140 4470 : return ret;
141 49171 : }
142 :
143 :
144 2235 : void FeedDateParser::yearFix(QString& timestamp)
145 : {
146 : // If the timestamp is something like this:
147 : // Tue, 02 Jul 13 [etc]
148 : // We want to make it something like this:
149 : // Tue, 02 Jul 2013 [etc]
150 2235 : if (timestamp.length() == 0 || !timestamp[0].isLetter()) {
151 337 : return; // Early exit.
152 : }
153 :
154 1898 : bool seenWeekday = false;
155 1898 : bool seenDay = false;
156 1898 : bool seenMonth = false;
157 1898 : bool seenYear = false;
158 1898 : bool hitSpace = true; // This controls whether or not we examine the character.
159 1898 : int charsInYear = 0;
160 32266 : for (int i = 0; i < timestamp.length(); i++) {
161 32266 : if (hitSpace && (timestamp[i].isLetter() || timestamp[i] == ',')) {
162 3796 : hitSpace = false; // reset
163 :
164 3796 : if (!seenWeekday) {
165 1898 : seenWeekday = true;
166 1898 : } else if (!seenMonth) {
167 1898 : seenMonth = true;
168 : }
169 28470 : } else if (hitSpace && (timestamp[i].isDigit())) {
170 3796 : hitSpace = false; // reset
171 :
172 3796 : if (!seenDay) {
173 1898 : seenDay = true;
174 1898 : } else if (!seenYear) {
175 1898 : seenYear = true;
176 : }
177 24674 : } else if (timestamp[i].isSpace()) {
178 7592 : if (seenYear) {
179 : // Here's where we find out if we can leave yet.
180 1898 : if (charsInYear != 2) {
181 1882 : break; // Early exit!
182 : } else {
183 : // Sigh... okay, now we have to back up and insert a "20".
184 : // Currently we're here: [Tue, 02 Jul 13 ]
185 16 : timestamp = timestamp.insert(i - 2, "20");
186 16 : return; // YAY! WE DID IT!
187 : }
188 : } else {
189 5694 : hitSpace = true;
190 : }
191 : }
192 :
193 30368 : if (seenYear) {
194 7560 : ++charsInYear;
195 : }
196 : }
197 : }
198 :
199 :
200 2235 : void FeedDateParser::shaveWeekdayName(QString& timestamp)
201 : {
202 : // NOTE:
203 : // By the time we've reached this method, the timestamp has
204 : // already been trimmed, and we've made sure the year has four digits.
205 :
206 2235 : int comma = timestamp.indexOf(',');
207 2235 : if (comma < 0) {
208 337 : return; // Early exit.
209 : }
210 :
211 : // Remove up to and including the comma itself.
212 1898 : timestamp = timestamp.remove(0, comma + 1).trimmed();
213 : }
214 :
215 2235 : void FeedDateParser::monthMassager(QString& timestamp)
216 : {
217 : // Add new ones as they're encountered.
218 2235 : timestamp = timestamp.replace("Sept ", "Sep ");
219 2235 : }
|