Line data Source code
1 : #include "HTMLSanitizer.h"
2 : #include "FangLogging.h"
3 :
4 : #include <QXmlStreamReader>
5 : #include <QXmlStreamWriter>
6 : #include <QStack>
7 :
8 : #include "QImageCache.h"
9 : #include "WebUtilities.h"
10 : #include "QTidyLibClassic.h"
11 :
12 : // Attribute and tag names used when reading and writing elements.
13 : #define S_WIDTH "width"
14 : #define S_HEIGHT "height"
15 : #define S_SRC "src"
16 : #define S_IMG "img"
17 : #define S_HREF "href"
18 : #define S_ID "id"
19 :
20 : namespace {
21 :
22 : /*!
23 : \brief Represents a DOM node during HTML parsing.
24 : */
25 : class DOMNode {
26 : public:
27 259 : DOMNode(QString tagName, int intID) :
28 259 : tagName(tagName),
29 259 : intID(intID),
30 259 : nonEmptyTextCount(0),
31 259 : numChildren(0)
32 259 : {}
33 :
34 : // Stack requires a default c'tor
35 : DOMNode() :
36 : intID(0),
37 : nonEmptyTextCount(0),
38 : numChildren(0)
39 : {}
40 :
41 : QString tagName;
42 : int intID;
43 : int nonEmptyTextCount;
44 : int numChildren;
45 : };
46 :
47 : } // anonymous namespace
48 :
49 41 : HTMLSanitizer::HTMLSanitizer(QObject *parent) :
50 : QObject(parent),
51 41 : currentId(0)
52 : {
53 41 : tagsToRemove << "script" // Javascript
54 82 : << "title" // Titles WTF?
55 82 : << "head" // Don't need head
56 82 : << "style" // Custom styles.
57 82 : << "iframe" // Iframes!
58 82 : << "object" // Plugins!
59 82 : << "embed" // Other plugins!
60 41 : << "hr"; // No horizontals allowed; they're ugly.
61 :
62 41 : classesToRemove << "feedflare" // Feedburger's 37 pieces of flare
63 82 : << "mf-viral" // Motherfucking viral?
64 41 : << "service-links-stumbleupon"; // StubbleUponYourFace
65 :
66 41 : shareButtonURLs << "twitter.com/home?status"
67 82 : << "plus.google.com/shar"
68 82 : << "facebook.com/shar"
69 82 : << "feedsportal.com/"
70 82 : << "api.tweetmeme.com/"
71 82 : << "stumbleupon.com/submit"
72 41 : << "share.feedsportal.com/share";
73 :
74 41 : containerTags << "p"
75 82 : << "div"
76 82 : << "span"
77 41 : << "pre";
78 :
79 58 : urlTransform = [](const QString& url) {
80 17 : return WebUtilities::urlFixup(url);
81 41 : };
82 41 : }
83 :
84 56 : void HTMLSanitizer::reset()
85 : {
86 56 : idsToDelete.clear();
87 56 : currentId = 0;
88 56 : }
89 :
90 454 : bool HTMLSanitizer::isHTMLEmpty(const QString& html)
91 : {
92 454 : QString copy = html;
93 454 : copy.replace(" ", "");
94 454 : copy.replace("\t", "");
95 454 : copy.replace("\n", "");
96 :
97 908 : return copy.size() == 0;
98 454 : }
99 :
100 25 : bool HTMLSanitizer::isShareURL(const QString &url)
101 : {
102 188 : for (const QString& shareURL : shareButtonURLs) {
103 165 : if (url.contains(shareURL, Qt::CaseInsensitive)) {
104 2 : return true;
105 : }
106 : }
107 :
108 23 : return false;
109 : }
110 :
111 285 : QString HTMLSanitizer::intToID(int id)
112 : {
113 285 : return "FangID_" + QString::number(id);
114 : }
115 :
116 53 : QString HTMLSanitizer::sanitize(const QString &document, QSet<QUrl> &imageURLs)
117 : {
118 : // We use TidyLib to convert the (potentially crappy) HTML into proper
119 : // XHTML. This will add a doctype and other unwanted headers/footers, so we strip those
120 : // out in a separate post-processing method. You'll see.
121 53 : QString doc = QTidyLibClassic::toXhtml("<html><body>" + document + "</body></html>");
122 53 : if (doc.isEmpty()) {
123 0 : qCDebug(logRewriter) << "Error loading HTML document";
124 :
125 0 : return "";
126 : }
127 :
128 : // Swap out non-breaking spaces here since QXmlStreamReader doesn't handle them well.
129 53 : doc.replace(" ", " ", Qt::CaseInsensitive);
130 :
131 : // We're going to count the number of tags to determine if this is a real HTML document,
132 : // or a text document.
133 53 : int tagCount = 0;
134 :
135 53 : QXmlStreamReader xml;
136 53 : xml.addData(doc);
137 :
138 53 : QString output;
139 53 : QXmlStreamWriter writer(&output);
140 53 : writer.setAutoFormatting(false);
141 :
142 : // If we're skipping elements, this is >= 1
143 53 : int skip = 0;
144 :
145 : // Current stack.
146 53 : QStack<DOMNode> stack;
147 :
148 : // Was the last node text?
149 53 : bool lastWasText = false;
150 :
151 : // Track nesting depth inside <pre> to preserve whitespace.
152 53 : int preDepth = 0;
153 :
154 1720 : while (!xml.atEnd()) {
155 : // Grab the next thingie.
156 1667 : xml.readNext();
157 :
158 1667 : if (xml.isStartElement()) {
159 : // Start
160 441 : tagCount++;
161 :
162 441 : if (0 == skip) {
163 332 : QString tagName = xml.name().toString().toLower();
164 664 : QString classValue = xml.attributes().value("class").toString();
165 664 : QString href = xml.attributes().value(S_HREF).toString();
166 :
167 602 : if (tagsToRemove.contains(tagName) ||
168 539 : classesToRemove.contains(classValue) || // Delete known bad classes
169 904 : (tagName == "a" && isShareURL(href)) || // Delete share links
170 609 : (tagName == "br" && !lastWasText)) { // Delete br's that weren't preceeded by text.
171 : // Skip it good!
172 73 : skip = 1;
173 : } else {
174 : // Write the tag.
175 259 : writer.writeStartElement(tagName);
176 :
177 259 : currentId++;
178 518 : writer.writeAttribute(S_ID, intToID(currentId));
179 :
180 : // If there's a parent node, add a child.
181 259 : if (stack.size()) {
182 206 : stack.top().numChildren++;
183 : }
184 :
185 : // Push it.
186 259 : stack.push(DOMNode(tagName, currentId));
187 :
188 259 : if (tagName == "pre") {
189 5 : preDepth++;
190 : }
191 :
192 : // Anchor tags.
193 282 : if (tagName == "a" && xml.attributes().hasAttribute(S_HREF)) {
194 69 : writer.writeAttribute(S_HREF, xml.attributes().value(S_HREF).toString());
195 : }
196 :
197 : // Image tags.
198 276 : if (tagName == S_IMG && xml.attributes().hasAttribute(S_SRC)) {
199 34 : QString imgSrc = xml.attributes().value(S_SRC).toString();
200 17 : if (urlTransform) {
201 17 : imgSrc = urlTransform(imgSrc);
202 : }
203 34 : writer.writeAttribute(S_SRC, imgSrc);
204 :
205 : // WordPress emoji: class="wp-smiley" images are inline emoji
206 : // that should render at text size (~16px), not at their
207 : // natural pixel dimensions (typically 72x72).
208 34 : QString imgClassValue = xml.attributes().value("class").toString();
209 17 : bool isSmiley = imgClassValue.contains("wp-smiley");
210 17 : if (isSmiley) {
211 2 : writer.writeAttribute(S_WIDTH, "16");
212 2 : writer.writeAttribute(S_HEIGHT, "16");
213 2 : writer.writeAttribute("data-smiley", "1");
214 : }
215 :
216 : // Check for tracking pixels using HTML dimensions.
217 17 : if (!isSmiley) {
218 30 : QString sWidth = xml.attributes().value(S_WIDTH).toString();
219 30 : QString sHeight = xml.attributes().value(S_HEIGHT).toString();
220 :
221 : bool widthOK, heightOK;
222 15 : int width = sWidth.toInt(&widthOK);
223 15 : int height = sHeight.toInt(&heightOK);
224 :
225 15 : if (widthOK && heightOK) {
226 10 : if (width < 3 || height < 3) {
227 : // Delete tiny images (tracking pixels).
228 3 : idsToDelete << intToID(currentId);
229 : } else {
230 : // Pass dimensions as metadata for finalize() to use
231 : // when the image fetch fails and we need to verify
232 : // this isn't a tracking pixel.
233 14 : writer.writeAttribute(S_WIDTH, sWidth);
234 14 : writer.writeAttribute(S_HEIGHT, sHeight);
235 : }
236 : }
237 15 : }
238 :
239 : // Fetch images for caching and dimension verification.
240 17 : if (!idsToDelete.contains(intToID(currentId))) {
241 14 : imageURLs << imgSrc;
242 : }
243 17 : }
244 :
245 259 : lastWasText = false;
246 : }
247 332 : } else {
248 109 : skip++;
249 : }
250 1226 : } else if (xml.isEndElement()) {
251 441 : QString tagName = xml.name().toString().toLower();
252 :
253 : // End
254 441 : if (0 == skip) {
255 259 : writer.writeEndElement();
256 :
257 : // Pop our node and investigate.
258 259 : DOMNode dom = stack.pop();
259 :
260 259 : if (tagName == "pre") {
261 5 : preDepth--;
262 : }
263 :
264 : // If it's a container and we didn't write any text, then delete this tag in the
265 : // second pass.
266 259 : if (containerTags.contains(tagName) && dom.nonEmptyTextCount == 0 && dom.numChildren == 0) {
267 : //
268 : // This doesn't work -- at the very least the IDs are wrong. We need to
269 : // employ a stack here.
270 : //
271 6 : idsToDelete << intToID(dom.intID);
272 : }
273 :
274 259 : lastWasText = false;
275 259 : } else {
276 182 : skip--;
277 : }
278 1226 : } else if (xml.isCharacters() && 0 == skip) {
279 : // Text
280 454 : QString text = xml.text().toString();
281 454 : bool isEmpty = isHTMLEmpty(text);
282 :
283 : // Don't allow pure empty tags, though a single space is ok.
284 454 : if (!isEmpty || text == " ") {
285 124 : if (preDepth == 0) {
286 118 : bool addSpaceStart = text.startsWith('\n');
287 118 : bool addSpaceEnd = text.endsWith('\n');
288 :
289 : // Text can start or end with a newline -- delete 'em.
290 118 : removeNewlinesBothSides(text);
291 :
292 : // Add back extra spaces so text doesn'truntogether.
293 118 : if (addSpaceStart) {
294 12 : text = ' ' + text;
295 : }
296 :
297 118 : if (addSpaceEnd) {
298 7 : text = text + ' ';
299 : }
300 : }
301 :
302 : // Write the text!
303 124 : writer.writeCharacters(text);
304 :
305 124 : if (!isEmpty) {
306 117 : stack.top().nonEmptyTextCount++;
307 : }
308 :
309 124 : lastWasText = true;
310 : }
311 785 : } else if (xml.isEntityReference() && 0 == skip) {
312 : // Entity
313 0 : QString entity = xml.name().toString();
314 0 : writer.writeEntityReference(entity);
315 331 : } else if (xml.isStartDocument()) {
316 : // Doc start
317 53 : writer.writeStartDocument("1.0");
318 278 : } else if (xml.isEndDocument()) {
319 : // Doc end
320 53 : writer.writeEndElement();
321 : }
322 : }
323 :
324 53 : if (xml.hasError()) {
325 0 : qCDebug(logRewriter) << "Error reading XML: " << xml.errorString();
326 : }
327 :
328 53 : if (writer.hasError()) {
329 0 : qCDebug(logRewriter) << "QXmlStreamWriter had an error of some kind.";
330 : }
331 :
332 :
333 58 : if (tagCount <= 5 && output !=
334 58 : "<?xml version=\"1.0\"?><html id=\"FangID_1\"><body id=\"FangID_2\"/></html>") {
335 : // Turns out we're not dealing with an HTML document: there's not enough tags, and it's
336 : // not an empty document (which can be caused by bad HTML.)
337 : // Ditch the Tidy'd doc and rewrite as plain text from the original.
338 4 : return textToHtml(document);
339 : }
340 :
341 : // Return new document.
342 49 : return output;
343 53 : }
344 :
345 118 : void HTMLSanitizer::removeNewlinesBothSides(QString &docString)
346 : {
347 130 : while (docString.startsWith("\n")) {
348 12 : docString = docString.mid(1);
349 : }
350 :
351 125 : while (docString.endsWith("\n")) {
352 7 : docString = docString.left(docString.length() - 1);
353 : }
354 118 : }
355 :
356 6 : QString HTMLSanitizer::textToHtml(const QString& input)
357 : {
358 6 : QString output;
359 :
360 : // Keep it simple, stupid.
361 6 : QString cleaned = input.trimmed();
362 6 : cleaned.replace("\r\n", "\r");
363 6 : cleaned.replace("\r", "\n");
364 :
365 6 : QStringList list = cleaned.split('\n', Qt::SkipEmptyParts);
366 17 : for (const QString& line : list) {
367 : // Trim lines, and skip empty ones.
368 11 : QString trimmed = line.trimmed();
369 11 : if (!trimmed.isEmpty()) {
370 10 : output += "<p>" + trimmed + "</p>";
371 : }
372 11 : }
373 :
374 : // As a signal to the 2nd pass, we prepend the output with an ASCII beep character. 2nd pass
375 : // will remove this and return the string without further modification.
376 6 : output = '\07' + output;
377 :
378 12 : return output;
379 6 : }
380 :
381 42 : QString HTMLSanitizer::finalize(const QString &html, const QMap<QUrl, ImageData> &imageResults)
382 : {
383 : // If it was a text-only document, we've prepended it with an ASCII beep. All we have to do
384 : // here is remove the beep and return it.
385 42 : if (html.startsWith('\07')) {
386 3 : return html.mid(1);
387 : }
388 :
389 39 : QXmlStreamReader xml;
390 39 : xml.addData(html);
391 :
392 39 : QString output;
393 39 : QXmlStreamWriter writer(&output);
394 39 : writer.setAutoFormatting(false);
395 39 : int skip = 0; // Skip stack.
396 39 : int preDepth = 0; // Track nesting depth inside <pre> to preserve whitespace.
397 39 : QString lastTag = "";
398 :
399 658 : while (!xml.atEnd()) {
400 : // Grab the next thingie.
401 619 : xml.readNext();
402 :
403 619 : if (xml.isStartElement()) {
404 218 : if (0 == skip) {
405 : // Start
406 218 : QString tagName = xml.name().toString().toLower();
407 436 : QString id = xml.attributes().value(S_ID).toString();
408 :
409 218 : if (idsToDelete.contains(id)) {
410 : // We need to delete this tag! Skip it.
411 8 : skip = 1;
412 210 : } else if (tagName == S_IMG) {
413 36 : QString url = xml.attributes().value(S_SRC).toString();
414 18 : QString srcToUse = url;
415 18 : bool keepImage = false;
416 36 : bool isSmiley = xml.attributes().value("data-smiley").toString() == "1";
417 :
418 18 : int width = 0;
419 18 : int height = 0;
420 :
421 18 : ImageData imageData = imageResults.value(url);
422 18 : if (imageData.isValid()) {
423 14 : if (isSmiley) {
424 : // WordPress emoji: use the small dimensions from
425 : // sanitize() instead of the fetched pixel size.
426 4 : width = xml.attributes().value(S_WIDTH).toInt();
427 4 : height = xml.attributes().value(S_HEIGHT).toInt();
428 : } else {
429 12 : width = imageData.image.width();
430 12 : height = imageData.image.height();
431 : }
432 :
433 14 : if (width > 2 && height > 2) {
434 11 : QString cachedPath = "/images/" + QImageCache::saveImage(url, imageData);
435 11 : if (!cachedPath.isEmpty()) {
436 11 : srcToUse = cachedPath;
437 : }
438 11 : keepImage = true;
439 11 : }
440 16 : } else if (xml.attributes().hasAttribute(S_WIDTH) &&
441 8 : xml.attributes().hasAttribute(S_HEIGHT)) {
442 : // Fetch failed but image has known good dimensions from
443 : // sanitize() - keep it with the original URL.
444 4 : width = xml.attributes().value(S_WIDTH).toInt();
445 4 : height = xml.attributes().value(S_HEIGHT).toInt();
446 2 : keepImage = true;
447 : }
448 : // else: fetch failed and no known dimensions - skip.
449 : // Could be a tracking pixel we can't verify.
450 :
451 18 : if (keepImage) {
452 13 : writer.writeStartElement(tagName);
453 26 : writer.writeAttribute(S_SRC, srcToUse);
454 13 : if (width > 0 && height > 0) {
455 26 : writer.writeAttribute(S_WIDTH, QString::number(width));
456 26 : writer.writeAttribute(S_HEIGHT, QString::number(height));
457 : }
458 13 : if (isSmiley) {
459 2 : writer.writeAttribute("class", "smiley");
460 : }
461 13 : if (srcToUse != url) {
462 22 : writer.writeAttribute("data-original-src", url);
463 : }
464 13 : lastTag = tagName;
465 : } else {
466 5 : skip = 1;
467 : }
468 18 : } else {
469 : // Write the tag and all attributes (except for ID)
470 192 : writer.writeStartElement(tagName);
471 407 : for (const QXmlStreamAttribute& attribute : xml.attributes()) {
472 215 : if (attribute.name().toString() != S_ID) {
473 23 : writer.writeAttribute(attribute);
474 : }
475 192 : }
476 :
477 192 : if (tagName == "pre") {
478 4 : preDepth++;
479 : }
480 :
481 192 : lastTag = tagName;
482 : }
483 218 : } else {
484 0 : skip++;
485 : }
486 401 : } else if (xml.isEndElement()) {
487 : // End
488 218 : if (0 == skip) {
489 205 : if (xml.name().toString().toLower() == "pre") {
490 4 : preDepth--;
491 : }
492 205 : writer.writeEndElement();
493 : } else {
494 13 : skip--;
495 : }
496 183 : } else if (xml.isCharacters() && 0 == skip) {
497 : // Text
498 101 : QString text = xml.text().toString();
499 101 : if (preDepth == 0) {
500 : // Outside preformatted blocks, collapse newlines to spaces.
501 96 : text.replace("\n", " ");
502 : }
503 :
504 101 : writer.writeCharacters(text);
505 101 : lastTag = "#text";
506 183 : } else if (xml.isEntityReference() && 0 == skip) {
507 : // Entity
508 0 : QString entity = xml.name().toString();
509 0 : writer.writeEntityReference(entity);
510 0 : lastTag = "#entity";
511 82 : } else if (xml.isStartDocument()) {
512 : // Doc start
513 39 : writer.writeStartDocument(xml.documentVersion().toString());
514 43 : } else if (xml.isEndDocument()) {
515 : // Doc end;xml.documentVersion()
516 39 : writer.writeEndElement();
517 : }
518 : }
519 :
520 39 : if (xml.hasError()) {
521 0 : qCDebug(logRewriter) << "QXmlStreamReader had error: " << xml.errorString();
522 : }
523 :
524 39 : if (writer.hasError()) {
525 0 : qCDebug(logRewriter) << "QXmlStreamWriter had an error of some kind.";
526 : }
527 :
528 : // Post-process and return.
529 39 : postProcessDocString(output);
530 39 : return output;
531 39 : }
532 :
533 39 : void HTMLSanitizer::postProcessDocString(QString &docString)
534 : {
535 : // The R is for Redundant!
536 39 : docString.replace("\r", "");
537 :
538 : // Rip out headers/footers.
539 39 : docString.replace("<?xml version=\"1.0\"?><html>", "");
540 39 : docString.replace("<body>", "");
541 39 : docString.replace("</body></html>", "");
542 39 : docString.replace("<body/></html>", ""); // Empty body!
543 :
544 : // This happens.
545 39 : docString = docString.trimmed();
546 39 : }
|