Line data Source code
1 : #ifndef WEBPAGEGRABBER_H
2 : #define WEBPAGEGRABBER_H
3 :
4 : #include <QObject>
5 : #include <QString>
6 : #include <QUrl>
7 : #include <QTimer>
8 : class QWebDownload;
9 : class QNetworkAccessManager;
10 :
11 : /*!
12 : \brief Loads a web page at a given URL and signals with the XHTML document when done.
13 :
14 : Note that this class is not reentrant.
15 : */
16 : class WebPageGrabber : public QObject
17 : {
18 : Q_OBJECT
19 : public:
20 : /*!
21 : \brief WebPageGrabber creates an XHTML document from either a string or a URL.
22 : \param handleMetaRefresh If true, handles refreshes from within HTML documents rather than
23 : just HTTP communication.
24 : \param timeoutMS Timeout after last download activity in milliseconds
25 : \param parent Parent QObject; defaults to nullptr.
26 : \param networkManager Optional network manager for dependency injection (for testing)
27 : */
28 : explicit WebPageGrabber(bool handleMetaRefresh = defaultHandleMetaRefresh,
29 : int timeoutMS = defaultTimeoutMs,
30 : QObject *parent = nullptr,
31 : QNetworkAccessManager* networkManager = nullptr);
32 :
33 : /*!
34 : * \brief Just like the above but with all the defaults specified *except* for parent.
35 : */
36 : explicit WebPageGrabber(QObject *parent);
37 :
38 : /*!
39 : \brief Convert raw HTML bytes to XHTML via TidyLib.
40 : \return The XHTML string, or empty string on failure.
41 : */
42 : static QString htmlToXhtml(const QByteArray& html);
43 :
44 : signals:
45 : // If you requested a URL, ready() will be emitted when it's ready!
46 : // If document is null, an error happened. :(
47 : void ready(WebPageGrabber* grabber, QString* document);
48 :
49 : public slots:
50 : // Fetches the webpage and emits ready() with the XHTML document.
51 : // Signals with null on an error.
52 : virtual void load(const QUrl &url);
53 :
54 : // Loads the HTML string into a Tidy'd XHTML document and returns it (no signal is emitted).
55 : // Returns null on an error.
56 : QString* load(const QString& htmlString);
57 :
58 : // Returns a pointer to this object's own document member, or nullptr (not an empty string) when the error flag is set; the pointer must not outlive the grabber.
59 3 : inline QString* getDocument() { return error ? nullptr : &document; }
60 : // Returns a copy of the stored originalUrl member.
61 0 : inline QUrl getOriginalURL() const { return originalUrl; }
62 : // Returns the current value of the done flag.
63 0 : inline bool isDone() const { return done; }
64 :
65 : private slots:
66 : // Internal load methods.
67 : void loadInternal(const QUrl &url);
68 : QString* loadInternal(const QString& htmlString, bool handleRefresh);
69 :
70 : // Uh oh, an error!
71 : void onDownloadError(const QUrl& url, const QString& errorString);
72 :
73 : // We got some HTTP content!
74 : void onDownloadFinished(const QUrl& url, const QByteArray& data);
75 :
76 : // Searches the XHTML'd document for a redirect URL.
77 : // Returns the redirect URL, or the empty string.
78 : QString searchForRedirect(const QString& document);
79 :
80 : // Handles the boilerplate involved with emitting the ready() signal.
81 : void emitReadySignal(QString* document);
82 :
83 : private:
84 : static constexpr bool defaultHandleMetaRefresh = true; // Default for the constructor's handleMetaRefresh argument.
85 : static constexpr int defaultTimeoutMs = 5000; // Default for the constructor's timeoutMS argument (milliseconds).
86 : static constexpr int maxRedirects = 10; // NOTE(review): presumably the cap that redirectAttempts is checked against -- confirm in the implementation.
87 :
88 : void init();
89 :
90 : QWebDownload* core; // NOTE(review): ownership/lifetime is not visible in this header -- confirm in the implementation.
91 : QString document; // Storage whose address getDocument() hands out.
92 : bool handleMetaRefresh;
93 : int redirectAttempts;
94 : bool error; // When true, getDocument() returns nullptr.
95 : QUrl originalUrl; // Value reported by getOriginalURL().
96 : bool done; // Value reported by isDone().
97 : };
98 :
98 :
99 : #endif // WEBPAGEGRABBER_H
|