Line data Source code
1 : #ifndef HTMLSANITIZER_H
2 : #define HTMLSANITIZER_H
3 :
4 : #include <QObject>
5 : #include <QSet>
6 : #include <QMap>
7 : #include <QUrl>
8 : #include <QString>
9 :
10 : #include "../FangObject.h"
11 : #include "ImageGrabber.h"
12 : #include "WebPageGrabber.h"
13 :
14 : /*!
15 : \brief Represents a DOM node during HTML parsing.
16 : */
17 : class DOMNode {
18 : public:
19 174 : DOMNode(QString tagName, int intID) :
20 174 : tagName(tagName),
21 174 : intID(intID),
22 174 : nonEmptyTextCount(0),
23 174 : numChildren(0)
24 174 : {}
25 :
26 : // Stack requires a default c'tor
27 : DOMNode() :
28 : intID(0),
29 : nonEmptyTextCount(0),
30 : numChildren(0)
31 : {}
32 :
33 : QString tagName;
34 : int intID;
35 : int nonEmptyTextCount;
36 : int numChildren;
37 : };
38 :
39 : /*!
40 : \brief HTMLSanitizer handles HTML cleanup and XHTML conversion.
41 :
42 : This is a synchronous component that:
43 : - Converts HTML to XHTML via TidyLib
44 : - Removes unwanted tags (script, style, iframe, etc.)
45 : - Removes social media share buttons
46 : - Removes tracking pixels
47 : - Extracts image URLs that need dimension fetching
48 : - Handles two-pass processing to remove empty elements
49 : */
50 : class HTMLSanitizer : public FangObject
51 : {
52 : Q_OBJECT
53 : public:
54 : explicit HTMLSanitizer(QObject *parent = nullptr);
55 :
56 : /*!
57 : \brief First pass: sanitize HTML, returning clean XHTML and extracted image URLs.
58 : \param html Input HTML
59 : \param imageURLs [out] URLs of images that need dimension fetching
60 : \return Sanitized XHTML with FangID attributes for second pass
61 : */
62 : QString sanitize(const QString& html, QSet<QUrl>& imageURLs);
63 :
64 : /*!
65 : \brief Second pass: remove marked elements and apply image data.
66 : \param html First-pass output
67 : \param imageResults Map of URL to image data (from ImageGrabber)
68 : \return Final cleaned HTML
69 : */
70 : QString finalize(const QString& html, const QMap<QUrl, ImageData>& imageResults);
71 :
72 : /*!
73 : \brief Convert text-only content to HTML paragraphs.
74 : \param text Plain text input
75 : \return HTML with paragraphs
76 : */
77 : QString textToHtml(const QString& text);
78 :
79 : /*!
80 : \brief Reset state for a new batch of documents.
81 :
82 : Call this before processing a new set of documents to clear
83 : the idsToDelete set and reset the ID counter.
84 : */
85 : void reset();
86 :
87 : private:
88 : WebPageGrabber webPageGrabber;
89 :
90 : // Configuration
91 : QSet<QString> tagsToRemove;
92 : QSet<QString> classesToRemove;
93 : QList<QString> shareButtonURLs;
94 : QSet<QString> containerTags;
95 :
96 : // State for two-pass processing
97 : QSet<QString> idsToDelete;
98 : int currentId;
99 :
100 : // Helpers
101 : bool isShareURL(const QString& url);
102 : bool isHTMLEmpty(const QString& html);
103 : QString intToID(int id);
104 : void removeNewlinesBothSides(QString& str);
105 : void postProcessDocString(QString& docString);
106 : };
107 :
108 : #endif // HTMLSANITIZER_H
|