Line data Source code
1 : #include "HTMLSanitizer.h"
2 : #include "FangLogging.h"
3 :
4 : #include <QXmlStreamReader>
5 : #include <QXmlStreamWriter>
6 : #include <QStack>
7 :
8 : #include "ImageCache.h"
9 : #include "NetworkUtilities.h"
10 :
// Strings.
// Element and attribute names used by the sanitize() and finalize() passes in this file.
#define S_WIDTH "width"   // width attribute read from / written to <img> elements
#define S_HEIGHT "height" // height attribute read from / written to <img> elements
#define S_SRC "src"       // image source attribute
#define S_IMG "img"       // image element name
#define S_HREF "href"     // link target attribute on <a> elements
#define S_ID "id"         // attribute holding the temporary "FangID_<n>" marker
18 :
19 38 : HTMLSanitizer::HTMLSanitizer(QObject *parent) :
20 : FangObject(parent),
21 38 : webPageGrabber(false),
22 38 : currentId(0)
23 : {
24 38 : tagsToRemove << "script" // Javascript
25 76 : << "title" // Titles WTF?
26 76 : << "head" // Don't need head
27 76 : << "style" // Custom styles.
28 76 : << "iframe" // Iframes!
29 76 : << "object" // Plugins!
30 76 : << "embed" // Other plugins!
31 38 : << "hr"; // No horizontals allowed; they're ugly.
32 :
33 38 : classesToRemove << "feedflare" // Feedburger's 37 pieces of flare
34 76 : << "mf-viral" // Motherfucking viral?
35 38 : << "service-links-stumbleupon"; // StubbleUponYourFace
36 :
37 38 : shareButtonURLs << "twitter.com/home?status"
38 76 : << "plus.google.com/shar"
39 76 : << "facebook.com/shar"
40 76 : << "feedsportal.com/"
41 76 : << "api.tweetmeme.com/"
42 76 : << "stumbleupon.com/submit"
43 38 : << "share.feedsportal.com/share";
44 :
45 38 : containerTags << "p"
46 76 : << "div"
47 76 : << "span"
48 38 : << "pre";
49 38 : }
50 :
51 31 : void HTMLSanitizer::reset()
52 : {
53 31 : idsToDelete.clear();
54 31 : currentId = 0;
55 31 : }
56 :
57 288 : bool HTMLSanitizer::isHTMLEmpty(const QString& html)
58 : {
59 288 : QString copy = html;
60 288 : copy.replace(" ", "");
61 288 : copy.replace("\t", "");
62 288 : copy.replace("\n", "");
63 :
64 576 : return copy.size() == 0;
65 288 : }
66 :
67 24 : bool HTMLSanitizer::isShareURL(const QString &url)
68 : {
69 187 : for (const QString& shareURL : shareButtonURLs) {
70 164 : if (url.contains(shareURL, Qt::CaseInsensitive)) {
71 1 : return true;
72 : }
73 : }
74 :
75 23 : return false;
76 : }
77 :
78 190 : QString HTMLSanitizer::intToID(int id)
79 : {
80 190 : return "FangID_" + QString::number(id);
81 : }
82 :
83 30 : QString HTMLSanitizer::sanitize(const QString &document, QSet<QUrl> &imageURLs)
84 : {
85 : // We use TidyLib via WebPageGrabber to convert the (potentially crappy) HTML into proper
86 : // XHTML. This will add a doctype and other unwanted headers/footers, so we strip those
87 : // out in a separate post-processing method. You'll see.
88 30 : QString* doc = webPageGrabber.load("<html><body>" + document + "</body></html>");
89 30 : if (doc == nullptr) {
90 0 : qCDebug(logRewriter) << "Error loading HTML document";
91 :
92 0 : return "";
93 : }
94 :
95 : // Swap out non-breaking spaces here since QXmlStreamReader doesn't handle them well.
96 30 : doc->replace(" ", " ", Qt::CaseInsensitive);
97 :
98 : // We're going to count the number of tags to determine if this is a real HTML document,
99 : // or a text document.
100 30 : int tagCount = 0;
101 :
102 30 : QXmlStreamReader xml;
103 30 : xml.addData(*doc);
104 :
105 30 : QString output;
106 30 : QXmlStreamWriter writer(&output);
107 30 : writer.setAutoFormatting(false);
108 :
109 : // If we're skipping elements, this is >= 1
110 30 : int skip = 0;
111 :
112 : // Current stack.
113 30 : QStack<DOMNode> stack;
114 :
115 : // Was the last node text?
116 30 : bool lastWasText = false;
117 :
118 : // Track nesting depth inside <pre> to preserve whitespace.
119 30 : int preDepth = 0;
120 :
121 1052 : while (!xml.atEnd()) {
122 : // Grab the next thingie.
123 1022 : xml.readNext();
124 :
125 1022 : if (xml.isStartElement()) {
126 : // Start
127 275 : tagCount++;
128 :
129 275 : if (0 == skip) {
130 214 : QString tagName = xml.name().toString().toLower();
131 428 : QString classValue = xml.attributes().value("class").toString();
132 428 : QString href = xml.attributes().value(S_HREF).toString();
133 :
134 394 : if (tagsToRemove.contains(tagName) ||
135 360 : classesToRemove.contains(classValue) || // Delete known bad classes
136 603 : (tagName == "a" && isShareURL(href)) || // Delete share links
137 399 : (tagName == "br" && !lastWasText)) { // Delete br's that weren't preceeded by text.
138 : // Skip it good!
139 40 : skip = 1;
140 : } else {
141 : // Write the tag.
142 174 : writer.writeStartElement(tagName);
143 :
144 174 : currentId++;
145 348 : writer.writeAttribute(S_ID, intToID(currentId));
146 :
147 : // If there's a parent node, add a child.
148 174 : if (stack.size()) {
149 144 : stack.top().numChildren++;
150 : }
151 :
152 : // Push it.
153 174 : stack.push(DOMNode(tagName, currentId));
154 :
155 174 : if (tagName == "pre") {
156 4 : preDepth++;
157 : }
158 :
159 : // Anchor tags.
160 197 : if (tagName == "a" && xml.attributes().hasAttribute(S_HREF)) {
161 69 : writer.writeAttribute(S_HREF, xml.attributes().value(S_HREF).toString());
162 : }
163 :
164 : // Image tags.
165 184 : if (tagName == S_IMG && xml.attributes().hasAttribute(S_SRC)) {
166 20 : QString imgSrc = NetworkUtilities::urlFixup(xml.attributes().value(S_SRC).toString());
167 20 : writer.writeAttribute(S_SRC, imgSrc);
168 :
169 : // Check for tracking pixels using HTML dimensions.
170 20 : QString sWidth = xml.attributes().value(S_WIDTH).toString();
171 20 : QString sHeight = xml.attributes().value(S_HEIGHT).toString();
172 :
173 : bool widthOK, heightOK;
174 10 : int width = sWidth.toInt(&widthOK);
175 10 : int height = sHeight.toInt(&heightOK);
176 :
177 10 : if (widthOK && heightOK) {
178 7 : if (width < 3 || height < 3) {
179 : // Delete tiny images (tracking pixels).
180 1 : idsToDelete << intToID(currentId);
181 : } else {
182 : // Pass dimensions as metadata for finalize() to use
183 : // when the image fetch fails and we need to verify
184 : // this isn't a tracking pixel.
185 12 : writer.writeAttribute(S_WIDTH, sWidth);
186 12 : writer.writeAttribute(S_HEIGHT, sHeight);
187 : }
188 : }
189 :
190 : // Fetch images for caching and dimension verification.
191 10 : if (!idsToDelete.contains(intToID(currentId))) {
192 9 : imageURLs << imgSrc;
193 : }
194 10 : }
195 : }
196 :
197 214 : lastWasText = false;
198 214 : } else {
199 61 : skip++;
200 : }
201 747 : } else if (xml.isEndElement()) {
202 275 : QString tagName = xml.name().toString().toLower();
203 :
204 : // End
205 275 : if (0 == skip) {
206 174 : writer.writeEndElement();
207 :
208 : // Pop our node and investigate.
209 174 : DOMNode dom = stack.pop();
210 :
211 174 : if (tagName == "pre") {
212 4 : preDepth--;
213 : }
214 :
215 : // If it's a container and we didn't write any text, then delete this tag in the
216 : // second pass.
217 174 : if (containerTags.contains(tagName) && dom.nonEmptyTextCount == 0 && dom.numChildren == 0) {
218 : //
219 : // This doesn't work -- at the very least the IDs are wrong. We need to
220 : // employ a stack here.
221 : //
222 5 : idsToDelete << intToID(dom.intID);
223 : }
224 :
225 174 : lastWasText = false;
226 174 : } else {
227 101 : skip--;
228 : }
229 747 : } else if (xml.isCharacters() && 0 == skip) {
230 : // Text
231 288 : QString text = xml.text().toString();
232 288 : bool isEmpty = isHTMLEmpty(text);
233 :
234 : // Don't allow pure empty tags, though a single space is ok.
235 288 : if (!isEmpty || text == " ") {
236 93 : if (preDepth == 0) {
237 88 : bool addSpaceStart = text.startsWith('\n');
238 88 : bool addSpaceEnd = text.endsWith('\n');
239 :
240 : // Text can start or end with a newline -- delete 'em.
241 88 : removeNewlinesBothSides(text);
242 :
243 : // Add back extra spaces so text doesn'truntogether.
244 88 : if (addSpaceStart) {
245 9 : text = ' ' + text;
246 : }
247 :
248 88 : if (addSpaceEnd) {
249 6 : text = text + ' ';
250 : }
251 : }
252 :
253 : // Write the text!
254 93 : writer.writeCharacters(text);
255 :
256 93 : if (!isEmpty) {
257 86 : stack.top().nonEmptyTextCount++;
258 : }
259 :
260 93 : lastWasText = true;
261 : }
262 472 : } else if (xml.isEntityReference() && 0 == skip) {
263 : // Entity
264 0 : QString entity = xml.name().toString();
265 0 : writer.writeEntityReference(entity);
266 184 : } else if (xml.isStartDocument()) {
267 : // Doc start
268 30 : writer.writeStartDocument("1.0");
269 154 : } else if (xml.isEndDocument()) {
270 : // Doc end
271 30 : writer.writeEndElement();
272 : }
273 : }
274 :
275 30 : if (xml.hasError()) {
276 0 : qCDebug(logRewriter) << "Error reading XML: " << xml.errorString();
277 : }
278 :
279 30 : if (writer.hasError()) {
280 0 : qCDebug(logRewriter) << "QXmlStreamWriter had an error of some kind.";
281 : }
282 :
283 :
284 34 : if (tagCount <= 5 && output !=
285 34 : "<?xml version=\"1.0\"?><html id=\"FangID_1\"><body id=\"FangID_2\"/></html>") {
286 : // Turns out we're not dealing with an HTML document: there's not enough tags, and it's
287 : // not an empty document (which can be caused by bad HTML.)
288 : // Ditch the Tidy'd doc and rewrite as plain text from the original.
289 3 : return textToHtml(document);
290 : }
291 :
292 : // Return new document.
293 27 : return output;
294 30 : }
295 :
296 37 : QString HTMLSanitizer::finalize(const QString &html, const QMap<QUrl, ImageData> &imageResults)
297 : {
298 : // If it was a text-only document, we've prepended it with an ASCII beep. All we have to do
299 : // here is remove the beep and return it.
300 37 : if (html.startsWith('\07')) {
301 3 : return html.mid(1);
302 : }
303 :
304 34 : QXmlStreamReader xml;
305 34 : xml.addData(html);
306 :
307 34 : QString output;
308 34 : QXmlStreamWriter writer(&output);
309 34 : writer.setAutoFormatting(false);
310 34 : int skip = 0; // Skip stack.
311 34 : int preDepth = 0; // Track nesting depth inside <pre> to preserve whitespace.
312 34 : QString lastTag = "";
313 :
314 582 : while (!xml.atEnd()) {
315 : // Grab the next thingie.
316 548 : xml.readNext();
317 :
318 548 : if (xml.isStartElement()) {
319 193 : if (0 == skip) {
320 : // Start
321 193 : QString tagName = xml.name().toString().toLower();
322 386 : QString id = xml.attributes().value(S_ID).toString();
323 :
324 193 : if (idsToDelete.contains(id)) {
325 : // We need to delete this tag! Skip it.
326 6 : skip = 1;
327 187 : } else if (tagName == S_IMG) {
328 32 : QString url = xml.attributes().value(S_SRC).toString();
329 16 : QString srcToUse = url;
330 16 : bool keepImage = false;
331 :
332 16 : int width = 0;
333 16 : int height = 0;
334 :
335 16 : ImageData imageData = imageResults.value(url);
336 16 : if (imageData.isValid()) {
337 12 : width = imageData.image.width();
338 12 : height = imageData.image.height();
339 :
340 12 : if (width > 2 && height > 2) {
341 9 : QString cachedPath = ImageCache::saveImage(url, imageData);
342 9 : if (!cachedPath.isEmpty()) {
343 9 : srcToUse = cachedPath;
344 : }
345 9 : keepImage = true;
346 9 : }
347 16 : } else if (xml.attributes().hasAttribute(S_WIDTH) &&
348 8 : xml.attributes().hasAttribute(S_HEIGHT)) {
349 : // Fetch failed but image has known good dimensions from
350 : // sanitize() - keep it with the original URL.
351 4 : width = xml.attributes().value(S_WIDTH).toInt();
352 4 : height = xml.attributes().value(S_HEIGHT).toInt();
353 2 : keepImage = true;
354 : }
355 : // else: fetch failed and no known dimensions - skip.
356 : // Could be a tracking pixel we can't verify.
357 :
358 16 : if (keepImage) {
359 11 : writer.writeStartElement(tagName);
360 22 : writer.writeAttribute(S_SRC, srcToUse);
361 11 : if (width > 0 && height > 0) {
362 22 : writer.writeAttribute(S_WIDTH, QString::number(width));
363 22 : writer.writeAttribute(S_HEIGHT, QString::number(height));
364 : }
365 11 : if (srcToUse != url) {
366 18 : writer.writeAttribute("data-original-src", url);
367 : }
368 11 : lastTag = tagName;
369 : } else {
370 5 : skip = 1;
371 : }
372 16 : } else {
373 : // Write the tag and all attributes (except for ID)
374 171 : writer.writeStartElement(tagName);
375 365 : for (const QXmlStreamAttribute& attribute : xml.attributes()) {
376 194 : if (attribute.name().toString() != S_ID) {
377 23 : writer.writeAttribute(attribute);
378 : }
379 171 : }
380 :
381 171 : if (tagName == "pre") {
382 4 : preDepth++;
383 : }
384 :
385 171 : lastTag = tagName;
386 : }
387 193 : } else {
388 0 : skip++;
389 : }
390 355 : } else if (xml.isEndElement()) {
391 : // End
392 193 : if (0 == skip) {
393 182 : if (xml.name().toString().toLower() == "pre") {
394 4 : preDepth--;
395 : }
396 182 : writer.writeEndElement();
397 : } else {
398 11 : skip--;
399 : }
400 162 : } else if (xml.isCharacters() && 0 == skip) {
401 : // Text
402 90 : QString text = xml.text().toString();
403 90 : if (preDepth == 0) {
404 : // Outside preformatted blocks, collapse newlines to spaces.
405 85 : text.replace("\n", " ");
406 : }
407 :
408 90 : writer.writeCharacters(text);
409 90 : lastTag = "#text";
410 162 : } else if (xml.isEntityReference() && 0 == skip) {
411 : // Entity
412 0 : QString entity = xml.name().toString();
413 0 : writer.writeEntityReference(entity);
414 0 : lastTag = "#entity";
415 72 : } else if (xml.isStartDocument()) {
416 : // Doc start
417 34 : writer.writeStartDocument(xml.documentVersion().toString());
418 38 : } else if (xml.isEndDocument()) {
419 : // Doc end;xml.documentVersion()
420 34 : writer.writeEndElement();
421 : }
422 : }
423 :
424 34 : if (xml.hasError()) {
425 0 : qCDebug(logRewriter) << "QXmlStreamReader had error: " << xml.errorString();
426 : }
427 :
428 34 : if (writer.hasError()) {
429 0 : qCDebug(logRewriter) << "QXmlStreamWriter had an error of some kind.";
430 : }
431 :
432 : // Post-process and return.
433 34 : postProcessDocString(output);
434 34 : return output;
435 34 : }
436 :
437 34 : void HTMLSanitizer::postProcessDocString(QString &docString)
438 : {
439 : // The R is for Redundant!
440 34 : docString.replace("\r", "");
441 :
442 : // Rip out headers/footers.
443 34 : docString.replace("<?xml version=\"1.0\"?><html>", "");
444 34 : docString.replace("<body>", "");
445 34 : docString.replace("</body></html>", "");
446 34 : docString.replace("<body/></html>", ""); // Empty body!
447 :
448 : // This happens.
449 34 : docString = docString.trimmed();
450 34 : }
451 :
452 88 : void HTMLSanitizer::removeNewlinesBothSides(QString &docString)
453 : {
454 97 : while (docString.startsWith("\n")) {
455 9 : docString = docString.mid(1);
456 : }
457 :
458 94 : while (docString.endsWith("\n")) {
459 6 : docString = docString.left(docString.length() - 1);
460 : }
461 88 : }
462 :
463 3 : QString HTMLSanitizer::textToHtml(const QString& input)
464 : {
465 3 : QString output;
466 :
467 : // Keep it simple, stupid.
468 3 : QString cleaned = input.trimmed();
469 3 : cleaned.replace("\r\n", "\r");
470 3 : cleaned.replace("\r", "\n");
471 :
472 3 : QStringList list = cleaned.split('\n', Qt::SkipEmptyParts);
473 9 : for (const QString& line : list) {
474 : // Trim lines, and skip empty ones.
475 6 : QString trimmed = line.trimmed();
476 6 : if (!trimmed.isEmpty()) {
477 5 : output += "<p>" + trimmed + "</p>";
478 : }
479 6 : }
480 :
481 : // As a signal to the 2nd pass, we prepend the output with an ASCII beep character. 2nd pass
482 : // will remove this and return the string without further modification.
483 3 : output = '\07' + output;
484 :
485 6 : return output;
486 3 : }
487 :
|