Line data Source code
1 : #ifndef RAWFEEDIMAGESIZEREWRITER_H
2 : #define RAWFEEDIMAGESIZEREWRITER_H
3 :
4 : #include <QObject>
5 : #include <QList>
6 :
7 : #include "../parser/RawNews.h"
8 : #include "../FangObject.h"
9 : #include "ImageGrabber.h"
10 : #include "WebPageGrabber.h"
11 :
12 : // Represents a DOM node.
13 : class DOMNode {
14 : public:
15 158 : DOMNode(QString tagName, int intID) :
16 158 : tagName(tagName),
17 158 : intID(intID),
18 158 : nonEmptyTextCount(0),
19 158 : numChildren(0)
20 158 : {}
21 :
22 : // Stack requires a default c'tor
23 : DOMNode() :
24 : intID(0),
25 : nonEmptyTextCount(0),
26 : numChildren(0)
27 : {}
28 :
29 : QString tagName;
30 : int intID;
31 : int nonEmptyTextCount;
32 : int numChildren;
33 : };
34 :
35 : /**
36 : * @brief Takes a "raw" HTML feed and processes it in the following ways:
37 : * - Tidy'd into XHTML fragments
38 : * - Image sizes are baked in
39 : * - Javascript is stripped
40 : * - Common social media buttons removed
41 : * - Tracking pixels? Nope.
42 : */
43 : class RawFeedRewriter : public FangObject
44 : {
45 : Q_OBJECT
46 : public:
47 : explicit RawFeedRewriter(QObject *parent = nullptr);
48 :
49 :
50 : signals:
51 : /**
52 : * @brief We're done! The feed you passed in as been modified.
53 : */
54 : void finished();
55 :
56 : public slots:
57 :
58 : void rewrite(QList<RawNews*>* newsList);
59 :
60 : /**
61 : * @return
62 : */
63 0 : inline QList<RawNews*>* getNewsList() { return newsList; }
64 :
65 : protected:
66 : // Returns true if the text in a node is just whitespace.
67 : bool isHTMLEmpty(QString html);
68 :
69 : // Check whether we're looking at a share button URL.
70 : bool isShareURL(const QString& url);
71 :
72 : // Turns an int into an ID.
73 : QString intToID(int id);
74 :
75 : // First pass rewriter.
76 : QString rewriteFirstPass(const QString& document, QSet<QUrl>& imageURLs);
77 :
78 : // Calls rewriteSecondPass() on all news HTML.
79 : void rewriteAllSecondPass();
80 :
81 : // Same as above, but this takes care of the images (if needed) and deletes empty elements.
82 : QString rewriteSecondPass(QString& docString);
83 :
84 : // Post-process our news list.
85 : void postProcess();
86 :
87 : // Remove headers, footers, and other garbage.
88 : void postProcessDocString(QString& docString);
89 :
90 : // Resizes image dimensions.
91 : void imageResize(int width, int height, int* newWidth, int* newHeight);
92 :
93 : // Removes excessive newlines.
94 : void removeNewlinesBothSides(QString& docString);
95 :
96 : // Some news is text instead of HTML (Hearst's papers, for example.)
97 : QString rewriteTextOnlyNews(QString input);
98 :
99 : protected slots:
100 : // We've grabbed our images.
101 : void onImageGrabberFinished();
102 :
103 :
104 : private:
105 : // The current news list.
106 : QList<RawNews*>* newsList;
107 :
108 : // Web page grabber.
109 : WebPageGrabber webPageGrabber;
110 :
111 : // Image grabber! GRAB GRAB GRAB
112 : ImageGrabber imageGrabber;
113 :
114 : // Setup.
115 : QSet<QString> tagsToRemove;
116 : QSet<QString> classesToRemove;
117 : QList<QString> shareButtonURLs;
118 : QSet<QString> containerTags;
119 :
120 : // Element IDs.
121 : QSet<QString> idsToDelete;
122 : int intID;
123 : };
124 :
125 : #endif // RAWFEEDIMAGESIZEREWRITER_H
|