Line data Source code
1 : #include "RawFeedRewriter.h"
2 :
3 : #include <QXmlStreamReader>
4 : #include <QXmlStreamWriter>
5 : #include <QString>
6 : #include <QStack>
7 : #include <QDebug>
8 :
9 : #include "NetworkUtilities.h"
10 :
// Maximum width written for an image. imageResize() scales any image whose width is at or
// above this value down to exactly this width.
#define MAX_ELEMENT_WIDTH 400

// Element and attribute names used by both rewriting passes.
#define S_WIDTH "width"
#define S_HEIGHT "height"
#define S_SRC "src"
#define S_IMG "img"
#define S_HREF "href"
#define S_ID "id"
21 :
22 27 : RawFeedRewriter::RawFeedRewriter(QObject *parent) :
23 : FangObject(parent),
24 27 : newsList(nullptr),
25 27 : webPageGrabber(false),
26 27 : imageGrabber(),
27 54 : intID(0)
28 : {
29 27 : connect(&imageGrabber, &ImageGrabber::finished, this, &RawFeedRewriter::onImageGrabberFinished);
30 :
31 27 : tagsToRemove << "script" // Javascript
32 54 : << "title" // Titles WTF?
33 54 : << "head" // Don't need head
34 54 : << "style" // Custom styles.
35 54 : << "iframe" // Iframes!
36 54 : << "object" // Plugins!
37 54 : << "embed" // Other plugins!
38 27 : << "hr"; // No horizontals allowed; they're ugly.
39 :
40 27 : classesToRemove << "feedflare" // Feedburger's 37 pieces of flare
41 54 : << "mf-viral" // Motherfucking viral?
42 27 : << "service-links-stumbleupon"; // StubbleUponYourFace
43 :
44 27 : shareButtonURLs << "twitter.com/home?status"
45 54 : << "plus.google.com/shar"
46 54 : << "facebook.com/shar"
47 54 : << "feedsportal.com/"
48 54 : << "api.tweetmeme.com/"
49 54 : << "stumbleupon.com/submit"
50 27 : << "share.feedsportal.com/share";
51 :
52 27 : containerTags << "p"
53 54 : << "div"
54 54 : << "span"
55 27 : << "pre";
56 27 : }
57 :
58 27 : void RawFeedRewriter::rewrite(QList<RawNews *> *newsList)
59 : {
60 : // Save our news list!</protestChant>
61 27 : this->newsList = newsList;
62 :
63 27 : QSet<QUrl> imageURLs;
64 27 : idsToDelete.clear();
65 27 : intID = 0;
66 :
67 : // Iterate over all the news we have.
68 54 : for (RawNews* news : *newsList) {
69 27 : if (news->content.size()) {
70 0 : news->content = rewriteFirstPass(news->content, imageURLs);
71 : }
72 :
73 27 : if (news->description.size()) {
74 26 : news->description = rewriteFirstPass(news->description, imageURLs);
75 : }
76 : }
77 :
78 : // No images? We're done, yay!
79 27 : if (imageURLs.size() == 0) {
80 : // Gotta do this, g.
81 24 : rewriteAllSecondPass();
82 24 : postProcess();
83 :
84 24 : emit finished();
85 :
86 24 : return;
87 : }
88 :
89 : // Do the whole image resizing thang.
90 3 : imageGrabber.fetchUrls(imageURLs.values());
91 27 : }
92 :
93 258 : bool RawFeedRewriter::isHTMLEmpty(QString html)
94 : {
95 258 : html.replace(" ", "");
96 258 : html.replace("\t", "");
97 258 : html.replace("\n", "");
98 :
99 258 : return html.size() == 0;
100 : }
101 :
102 24 : bool RawFeedRewriter::isShareURL(const QString &url)
103 : {
104 187 : for (QString shareURL : shareButtonURLs) {
105 164 : if (url.contains(shareURL, Qt::CaseInsensitive)) {
106 1 : return true;
107 : }
108 164 : }
109 :
110 23 : return false;
111 : }
112 :
113 164 : QString RawFeedRewriter::intToID(int id)
114 : {
115 164 : return "FangID_" + QString::number(id);
116 : }
117 :
118 26 : QString RawFeedRewriter::rewriteFirstPass(const QString &document, QSet<QUrl> &imageURLs)
119 : {
120 : // We use TidyLib via WebPageGrabber to convert the (potentially crappy) HTML into proper
121 : // XHTML. This will add a doctype and other unwanted headers/footers, so we strip those
122 : // out in a separate post-processing method. You'll see.
123 26 : QString* doc = webPageGrabber.load("<html><body>" + document + "</body></html>");
124 26 : if (doc == nullptr) {
125 0 : qDebug() << "Error loading HTML document";
126 :
127 0 : return "";
128 : }
129 :
130 : // Swap out non-breaking spaces here since QXmlStreamReader doesn't handle them well.
131 26 : doc->replace(" ", " ", Qt::CaseInsensitive);
132 :
133 : // We're going to count the number of tags to determine if this is a real HTML document,
134 : // or a text document.
135 26 : int tagCount = 0;
136 :
137 26 : QXmlStreamReader xml;
138 26 : xml.addData(*doc);
139 :
140 26 : QString output;
141 26 : QXmlStreamWriter writer(&output);
142 26 : writer.setAutoFormatting(false);
143 :
144 : // If we're skipping elements, this is >= 1
145 26 : int skip = 0;
146 :
147 : // Current stack.
148 26 : QStack<DOMNode> stack;
149 :
150 : // Was the last node text?
151 26 : bool lastWasText = false;
152 :
153 938 : while (!xml.atEnd()) {
154 : // Grab the next thingie.
155 912 : xml.readNext();
156 :
157 912 : if (xml.isStartElement()) {
158 : // Start
159 247 : tagCount++;
160 :
161 247 : if (0 == skip) {
162 194 : QString tagName = xml.name().toString().toLower();
163 388 : QString classValue = xml.attributes().value("class").toString();
164 388 : QString href = xml.attributes().value(S_HREF).toString();
165 :
166 358 : if (tagsToRemove.contains(tagName) ||
167 328 : classesToRemove.contains(classValue) || // Delete known bad classes
168 551 : (tagName == "a" && isShareURL(href)) || // Delete share links
169 363 : (tagName == "br" && !lastWasText)) { // Delete br's that weren't preceeded by text.
170 : // Skip it good!
171 36 : skip = 1;
172 : } else {
173 : // Write the tag.
174 158 : writer.writeStartElement(tagName);
175 :
176 158 : intID++;
177 316 : writer.writeAttribute(S_ID, intToID(intID));
178 :
179 : // If there's a parent node, add a child.
180 158 : if (stack.size()) {
181 132 : stack.top().numChildren++;
182 : }
183 :
184 : // Push it.
185 158 : stack.push(DOMNode(tagName, intID));
186 :
187 : // Anchor tags.
188 181 : if (tagName == "a" && xml.attributes().hasAttribute(S_HREF)) {
189 69 : writer.writeAttribute(S_HREF, xml.attributes().value(S_HREF).toString());
190 : }
191 :
192 : // Image tags.
193 167 : if (tagName == S_IMG && xml.attributes().hasAttribute(S_SRC)) {
194 18 : QString imgSrc = NetworkUtilities::urlFixup(xml.attributes().value(S_SRC).toString());
195 18 : writer.writeAttribute(S_SRC, imgSrc);
196 :
197 18 : QString sWidth = xml.attributes().value(S_WIDTH).toString();
198 18 : QString sHeight = xml.attributes().value(S_HEIGHT).toString();
199 :
200 : bool widthOK, heightOK;
201 9 : int width = sWidth.toInt(&widthOK);
202 9 : int height = sHeight.toInt(&heightOK);
203 :
204 9 : if (widthOK && heightOK) {
205 6 : if (width < 3 || height < 3) {
206 : // Delete tiny images.
207 1 : idsToDelete << intToID(intID);
208 : } else {
209 : // Resize image if needed.
210 : int newWidth, newHeight;
211 5 : imageResize(width, height, &newWidth, &newHeight);
212 10 : writer.writeAttribute(S_WIDTH, QString::number(newWidth));
213 10 : writer.writeAttribute(S_HEIGHT, QString::number(newHeight));
214 : }
215 6 : } else {
216 : // Dammit, we're gonna have to fetch this image!
217 3 : imageURLs << imgSrc;
218 : }
219 9 : }
220 : }
221 :
222 194 : lastWasText = false;
223 194 : } else {
224 53 : skip++;
225 : }
226 665 : } else if (xml.isEndElement()) {
227 247 : QString tagName = xml.name().toString().toLower();
228 :
229 : // End
230 247 : if (0 == skip) {
231 158 : writer.writeEndElement();
232 :
233 : // Pop our node and investigate.
234 158 : DOMNode dom = stack.pop();
235 :
236 : // If it's a container and we didn't write any text, then delete this tag in the
237 : // second pass.
238 158 : if (containerTags.contains(tagName) && dom.nonEmptyTextCount == 0 && dom.numChildren == 0) {
239 : //
240 : // This doesn't work -- at the very least the IDs are wrong. We need to
241 : // employ a stack here.
242 : //
243 5 : idsToDelete << intToID(dom.intID);
244 : }
245 :
246 158 : lastWasText = false;
247 158 : } else {
248 89 : skip--;
249 : }
250 665 : } else if (xml.isCharacters() && 0 == skip) {
251 : // Text
252 258 : QString text = xml.text().toString();
253 258 : bool isEmpty = isHTMLEmpty(text);
254 :
255 : // Don't allow pure empty tags, though a single space is ok.
256 258 : if (!isEmpty || text == " ") {
257 88 : bool addSpaceStart = text.startsWith('\n');
258 88 : bool addSpaceEnd = text.endsWith('\n');
259 :
260 : // Text can start or end with a newline -- delete 'em.
261 88 : removeNewlinesBothSides(text);
262 :
263 : // Add back extra spaces so text doesn'truntogether.
264 88 : if (addSpaceStart) {
265 10 : text = ' ' + text;
266 : }
267 :
268 88 : if (addSpaceEnd) {
269 7 : text = text + ' ';
270 : }
271 :
272 : // Write the text!
273 88 : writer.writeCharacters(text);
274 :
275 88 : if (!isEmpty) {
276 81 : stack.top().nonEmptyTextCount++;
277 : }
278 :
279 88 : lastWasText = true;
280 : }
281 418 : } else if (xml.isEntityReference() && 0 == skip) {
282 : // Entity
283 0 : QString entity = xml.name().toString();
284 0 : writer.writeEntityReference(entity);
285 160 : } else if (xml.isStartDocument()) {
286 : // Doc start
287 26 : writer.writeStartDocument("1.0");
288 134 : } else if (xml.isEndDocument()) {
289 : // Doc end
290 26 : writer.writeEndElement();
291 : }
292 : }
293 :
294 26 : if (xml.hasError()) {
295 0 : qDebug() << "Error reading XML: " << xml.errorString();
296 : }
297 :
298 26 : if (writer.hasError()) {
299 0 : qDebug() << "QXmlStreamWriter had an error of some kind.";
300 : }
301 :
302 :
303 30 : if (tagCount <= 5 && output !=
304 30 : "<?xml version=\"1.0\"?><html id=\"FangID_1\"><body id=\"FangID_2\"/></html>") {
305 : // Turns out we're not dealing with an HTML document: there's not enough tags, and it's
306 : // not an empty document (which can be caused by bad HTML.)
307 : // Ditch the Tidy'd doc and rewrite as plain text from the original.
308 3 : return rewriteTextOnlyNews(document);
309 : }
310 :
311 : // Return new document.
312 23 : return output;
313 26 : }
314 :
315 27 : void RawFeedRewriter::rewriteAllSecondPass()
316 : {
317 : // Iterate over all the news... again!
318 54 : for (RawNews* news : *newsList) {
319 27 : if (news->content.size()) {
320 0 : news->content = rewriteSecondPass(news->content);
321 : }
322 :
323 27 : if (news->description.size()) {
324 26 : news->description = rewriteSecondPass(news->description);
325 : }
326 : }
327 27 : }
328 :
329 26 : QString RawFeedRewriter::rewriteSecondPass(QString &docString)
330 : {
331 : // If it was a text-only document, we've prepended it with an ASCII beep. All we have to do
332 : // here is remove the beep and return it.
333 26 : if (docString.startsWith('\07')) {
334 3 : return docString.mid(1);
335 : }
336 :
337 23 : QXmlStreamReader xml;
338 23 : xml.addData(docString);
339 :
340 23 : QString output;
341 23 : QXmlStreamWriter writer(&output);
342 23 : writer.setAutoFormatting(false);
343 23 : int skip = 0; // Skip stack.
344 23 : QString lastTag = "";
345 :
346 458 : while (!xml.atEnd()) {
347 : // Grab the next thingie.
348 435 : xml.readNext();
349 :
350 435 : if (xml.isStartElement()) {
351 152 : if (0 == skip) {
352 : // Start
353 152 : QString tagName = xml.name().toString().toLower();
354 304 : QString id = xml.attributes().value(S_ID).toString();
355 :
356 152 : if (idsToDelete.contains(id)) {
357 : // We need to delete this tag! Skip it.
358 6 : skip = 1;
359 146 : } else if (tagName == S_IMG) {
360 16 : QString url = xml.attributes().value(S_SRC).toString();
361 :
362 8 : int width = 0;
363 8 : int height = 0;
364 :
365 : // We got an image.
366 34 : if (xml.attributes().hasAttribute(S_WIDTH) &&
367 18 : xml.attributes().hasAttribute(S_HEIGHT)) {
368 : // Already have attributes? Cool.
369 10 : width = xml.attributes().value(S_WIDTH).toInt();
370 10 : height = xml.attributes().value(S_HEIGHT).toInt();
371 : } else {
372 3 : QImage image = imageGrabber.getResults()->value(url);
373 3 : if (!image.isNull()) {
374 : // Resize that baby, yeah!
375 2 : imageResize(image.width(), image.height(), &width, &height);
376 : }
377 3 : }
378 :
379 8 : if (width > 2 && height > 2) {
380 : // Okay, we got a good image and it's not a tracking pixel. Satisfaction!
381 7 : writer.writeStartElement(tagName);
382 14 : writer.writeAttribute(S_SRC, url);
383 14 : writer.writeAttribute(S_WIDTH, QString::number(width));
384 14 : writer.writeAttribute(S_HEIGHT, QString::number(height));
385 7 : writer.writeAttribute("align", "left"); // Always left-align.
386 :
387 7 : lastTag = tagName;
388 : } else {
389 : // Bad image! Skip!
390 1 : skip = 1;
391 : }
392 8 : } else {
393 : // Write the tag and all attributes (except for ID)
394 138 : writer.writeStartElement(tagName);
395 299 : for (QXmlStreamAttribute attribute : xml.attributes()) {
396 161 : if (attribute.name().toString() != S_ID) {
397 23 : writer.writeAttribute(attribute);
398 : }
399 299 : }
400 :
401 138 : lastTag = tagName;
402 : }
403 152 : } else {
404 0 : skip++;
405 : }
406 283 : } else if (xml.isEndElement()) {
407 : // End
408 152 : if (0 == skip) {
409 145 : writer.writeEndElement();
410 : } else {
411 7 : skip--;
412 : }
413 131 : } else if (xml.isCharacters() && 0 == skip) {
414 : // Text
415 81 : QString text = xml.text().toString();
416 81 : if (lastTag != "pre" && lastTag != "code") {
417 : // This happens due to some kind of auto-formatting glitch.
418 80 : text.replace("\n", " ");
419 : }
420 :
421 81 : writer.writeCharacters(text);
422 81 : lastTag = "#text";
423 131 : } else if (xml.isEntityReference() && 0 == skip) {
424 : // Entity
425 0 : QString entity = xml.name().toString();
426 0 : writer.writeEntityReference(entity);
427 0 : lastTag = "#entity";
428 50 : } else if (xml.isStartDocument()) {
429 : // Doc start
430 23 : writer.writeStartDocument(xml.documentVersion().toString());
431 27 : } else if (xml.isEndDocument()) {
432 : // Doc end;xml.documentVersion()
433 23 : writer.writeEndElement();
434 : }
435 : }
436 :
437 23 : if (xml.hasError()) {
438 0 : qDebug() << "QXmlStreamReader had error: " << xml.errorString();
439 : }
440 :
441 23 : if (writer.hasError()) {
442 0 : qDebug() << "QXmlStreamWriter had an error of some kind.";
443 : }
444 :
445 : // Return new document.
446 23 : return output;
447 23 : }
448 :
449 27 : void RawFeedRewriter::postProcess()
450 : {
451 : // Iterate over all the news we have.
452 54 : for (RawNews* news : *newsList) {
453 27 : if (news->content.size()) {
454 0 : postProcessDocString(news->content);
455 : }
456 :
457 27 : if (news->description.size()) {
458 26 : postProcessDocString(news->description);
459 : }
460 : }
461 27 : }
462 :
463 26 : void RawFeedRewriter::postProcessDocString(QString &docString)
464 : {
465 : // The R is for Redundant!
466 26 : docString.replace("\r", "");
467 :
468 : // Rip out headers/footers.
469 26 : docString.replace("<?xml version=\"1.0\"?><html>", "");
470 26 : docString.replace("<body>", "");
471 26 : docString.replace("</body></html>", "");
472 26 : docString.replace("<body/></html>", ""); // Empty body!
473 :
474 : // This happens.
475 26 : docString = docString.trimmed();
476 26 : }
477 :
478 7 : void RawFeedRewriter::imageResize(int width, int height, int *newWidth, int *newHeight)
479 : {
480 7 : *newWidth = width;
481 7 : *newHeight = height;
482 :
483 7 : if (width >= MAX_ELEMENT_WIDTH) {
484 : // Scale down the image.
485 7 : *newWidth = MAX_ELEMENT_WIDTH;
486 7 : *newHeight = (double) height / (double) width * (double) MAX_ELEMENT_WIDTH;
487 : }
488 7 : }
489 :
490 88 : void RawFeedRewriter::removeNewlinesBothSides(QString &docString)
491 : {
492 98 : while (docString.startsWith("\n")) {
493 10 : docString = docString.mid(1);
494 : }
495 :
496 95 : while (docString.endsWith("\n")) {
497 7 : docString = docString.left(docString.length() - 1);
498 : }
499 88 : }
500 :
501 3 : QString RawFeedRewriter::rewriteTextOnlyNews(QString input)
502 : {
503 3 : QString output;
504 :
505 : // Keep it simple, stupid.
506 3 : input = input.trimmed();
507 3 : input.replace("\r\n", "\r");
508 3 : input.replace("\r", "\n");
509 :
510 3 : QStringList list = input.split('\n', Qt::SkipEmptyParts);
511 9 : for (QString line : list) {
512 : // Trim lines, and skip empty ones.
513 6 : QString trimmed = line.trimmed();
514 6 : if (!trimmed.isEmpty()) {
515 5 : output += "<p>" + trimmed + "</p>";
516 : }
517 6 : }
518 :
519 : // As a signal to the 2nd pass, we prepend the output with an ASCII beep character. 2nd pass
520 : // will remove this and return the string without further modification.s
521 3 : output = '\07' + output;
522 :
523 6 : return output;
524 3 : }
525 :
526 3 : void RawFeedRewriter::onImageGrabberFinished()
527 : {
528 : // Gotta do this, g.
529 3 : rewriteAllSecondPass();
530 3 : postProcess();
531 :
532 3 : emit finished();
533 3 : }
|