LCOV - code coverage report
Current view: top level - src/utilities - HTMLSanitizer.h (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 100.0 % 6 6
Test Date: 2026-03-23 10:19:47 Functions: 100.0 % 1 1

            Line data    Source code
       1              : #ifndef HTMLSANITIZER_H
       2              : #define HTMLSANITIZER_H
       3              : 
       4              : #include <QObject>
       5              : #include <QSet>
       6              : #include <QMap>
       7              : #include <QUrl>
       8              : #include <QString>
       9              : 
      10              : #include "../FangObject.h"
      11              : #include "ImageGrabber.h"
      12              : #include "WebPageGrabber.h"
      13              : 
      14              : /*!
      15              :     \brief Represents a DOM node during HTML parsing: a tag name, an integer ID, and two counters that every constructor starts at zero.
      16              :  */
      17              : class DOMNode {
      18              : public:
      19          174 :     DOMNode(QString tagName, int intID) : // Named node: stores the given tag name and ID.
      20          174 :         tagName(tagName),
      21          174 :         intID(intID),
      22          174 :         nonEmptyTextCount(0), // Both counters always start at zero.
      23          174 :         numChildren(0)
      24          174 :     {}
      25              : 
      26              :     // Stack requires a default c'tor. tagName is left default-constructed; the ID and both counters are zero.
      27              :     DOMNode() :
      28              :         intID(0),
      29              :         nonEmptyTextCount(0),
      30              :         numChildren(0)
      31              :     {}
      32              : 
      33              :     QString tagName; // Tag name given to the two-argument c'tor; default-constructed QString otherwise.
      34              :     int intID; // Integer ID given to the two-argument c'tor; 0 otherwise.
      35              :     int nonEmptyTextCount; // Starts at 0. NOTE(review): presumably the number of non-empty text children seen so far -- confirm in the parser.
      36              :     int numChildren; // Starts at 0. NOTE(review): presumably the number of child nodes seen so far -- confirm in the parser.
      37              : };
      38              : 
      39              : /*!
      40              :     \brief HTMLSanitizer handles HTML cleanup and XHTML conversion.
      41              : 
      42              :     This is a synchronous component that:
      43              :       - Converts HTML to XHTML via TidyLib
      44              :       - Removes unwanted tags (script, style, iframe, etc.)
      45              :       - Removes social media share buttons
      46              :       - Removes tracking pixels
      47              :       - Extracts image URLs that need dimension fetching
      48              :       - Handles two-pass processing to remove empty elements (sanitize() is the first pass, finalize() the second)
      49              :  */
      50              : class HTMLSanitizer : public FangObject
      51              : {
      52              :     Q_OBJECT
      53              : public:
      54              :     explicit HTMLSanitizer(QObject *parent = nullptr); // parent is optional and defaults to nullptr.
      55              : 
      56              :     /*!
      57              :         \brief First pass: sanitize HTML, returning clean XHTML and extracted image URLs.
      58              :         \param html Input HTML
      59              :         \param imageURLs [out] Receives the URLs of images that need dimension fetching (NOTE(review): whether the set is cleared first is not visible in this header -- confirm)
      60              :         \return Sanitized XHTML with FangID attributes for second pass; pass this string to finalize()
      61              :      */
      62              :     QString sanitize(const QString& html, QSet<QUrl>& imageURLs);
      63              : 
      64              :     /*!
      65              :         \brief Second pass: remove marked elements and apply image data.
      66              :         \param html First-pass output, i.e. the XHTML returned by sanitize()
      67              :         \param imageResults Map of URL to image data (from ImageGrabber)
      68              :         \return Final cleaned HTML
      69              :      */
      70              :     QString finalize(const QString& html, const QMap<QUrl, ImageData>& imageResults);
      71              : 
      72              :     /*!
      73              :         \brief Convert text-only content to HTML paragraphs.
      74              :         \param text Plain text input
      75              :         \return HTML with paragraphs
      76              :      */
      77              :     QString textToHtml(const QString& text);
      78              : 
      79              :     /*!
      80              :         \brief Reset state for a new batch of documents.
      81              : 
      82              :         Call this before processing a new set of documents to clear
      83              :         the idsToDelete set and reset the ID counter.
      84              :      */
      85              :     void reset();
      86              : 
      87              : private:
      88              :     WebPageGrabber webPageGrabber; // NOTE(review): presumably performs the HTML-to-XHTML conversion -- confirm in the .cpp.
      89              : 
      90              :     // Configuration (NOTE(review): presumably filled in by the constructor -- confirm in the .cpp)
      91              :     QSet<QString> tagsToRemove; // Presumably the unwanted tag names (script, style, iframe, etc.) -- confirm.
      92              :     QSet<QString> classesToRemove; // Presumably class attribute values whose elements are dropped -- confirm.
      93              :     QList<QString> shareButtonURLs; // Presumably the URL patterns checked by isShareURL() -- confirm.
      94              :     QSet<QString> containerTags; // NOTE(review): role not visible in this header -- confirm in the .cpp.
      95              : 
      96              :     // State for two-pass processing
      97              :     QSet<QString> idsToDelete; // Cleared by reset(), per its doc comment.
      98              :     int currentId; // Presumably the ID counter that reset() resets -- confirm.
      99              : 
     100              :     // Helpers (defined in the .cpp; the notes below are inferred from names and signatures only)
     101              :     bool isShareURL(const QString& url); // Presumably true when url is a share-button link -- confirm.
     102              :     bool isHTMLEmpty(const QString& html); // Presumably true when html has no meaningful content -- confirm.
     103              :     QString intToID(int id); // Presumably turns an integer ID into its string form -- confirm.
     104              :     void removeNewlinesBothSides(QString& str); // Takes str by non-const reference; presumably strips newlines at both ends in place -- confirm.
     105              :     void postProcessDocString(QString& docString); // Takes docString by non-const reference; presumably edits it in place -- confirm.
     106              : };
     107              : 
     108              : #endif // HTMLSANITIZER_H
        

Generated by: LCOV version 2.0-1