Commit 17f74cf2f for clamav.net

commit 17f74cf2f3e21dfcdd85181685f76dda72857da2
Author: John Humlick <15677335+jhumlick@users.noreply.github.com>
Date:   Wed Jan 21 09:13:01 2026 -0800

    libclamav: Add PDF stats for metadata collection. (#1546)

    More PDF statistics were requested for feature parity.
    If metadata collection is enabled, the following additional PDF
    statistics will be collected:
      - Number of Automatic Actions
      - Number of Streams
      - Number of Objects
      - Number of Object Streams
      - Number of Trailers
      - Number of URIs
      - Number of Xrefs

    Additionally, some of the parsing logic was fixed during testing of
    these features.

    CLAM-2820

diff --git a/libclamav/pdf.c b/libclamav/pdf.c
index 035a403d7..3752af220 100644
--- a/libclamav/pdf.c
+++ b/libclamav/pdf.c
@@ -87,6 +87,7 @@ static void pdf_export_json(struct pdf_struct *);

 static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
 static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
+static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
 static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
 static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
 static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
@@ -1653,12 +1654,13 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
          * is an object stream. If so, collect the relevant info.
          */
         dict_len = obj->stream - start;
-        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
+        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/ObjStm"))) {
             int objstm_first  = -1;
             int objstm_length = -1;
             int objstm_n      = -1;

-            cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
+            cli_dbgmsg("pdf_extract_obj: Found /ObjStm\n");
+            pdf->stats.nobjstream++;

             dict_len = obj->stream - start;
             if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) {
@@ -1669,14 +1671,19 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
                 cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
             } else {
                 /* Add objstm to pdf struct, so it can be freed eventually */
-                pdf->nobjstms++;
-                pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
+                if (!pdf->objstms) {
+                    pdf->objstms = malloc(sizeof(struct objstm_struct *));
+                } else {
+                    pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * (pdf->nobjstms + 1));
+                }
                 if (!pdf->objstms) {
                     cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
                     status = CL_EMEM;
                     goto done;
                 }

+                pdf->nobjstms++;
+
                 CLI_CALLOC_OR_GOTO_DONE(
                     objstm, 1, sizeof(struct objstm_struct),
                     cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms),
@@ -1953,6 +1960,7 @@ struct pdfname_action {
 };

 static struct pdfname_action pdfname_actions[] = {
+    {"AA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AutomaticAction_cb},
     {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb},
     {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
     {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
@@ -2137,7 +2145,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
 static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length)
 {
     const char *enc;
-
+    pdf->stats.ntrailer++;
     enc = cli_memstr(s, length, "/Encrypt", 8);
     if (enc) {
         char *newID;
@@ -2221,6 +2229,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
         if ((CL_SUCCESS == has_stream) ||
             (CL_EFORMAT == has_stream)) {
             /* Stream found. Store this fact and the stream bounds. */
+            pdf->stats.nstream++;
             cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size);
             obj->flags |= (1 << OBJ_STREAM);
             obj->stream      = stream;
@@ -3900,6 +3909,8 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
                 if (!q || xrefCheck(q, q + bytesleft) == -1) {
                     cli_dbgmsg("cli_pdf: did not find valid xref\n");
                     pdf.flags |= 1 << BAD_PDF_TRAILER;
+                } else {
+                    pdf.stats.nxref++;
                 }
             }
         }
@@ -4562,26 +4573,47 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
     }
 }

-static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
+static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
 {
     UNUSEDPARAM(obj);
     UNUSEDPARAM(act);

-    if (NULL == pdf)
+    cli_ctx *ctx = (NULL != pdf) ? pdf->ctx : NULL; /* guard first: pdf may be NULL */
+    /* Callback that bumps pdf->stats.nacroform, but only while metadata JSON is collected. */
+    if ((NULL == ctx) || !(ctx->this_layer_metadata_json) || !(SCAN_COLLECT_METADATA)) {
         return;
+    }

-    pdf->stats.nrichmedia++;
+    pdf->stats.nacroform++;
 }

-static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
+static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
 {
     UNUSEDPARAM(obj);
     UNUSEDPARAM(act);

-    if (NULL == pdf)
+    cli_ctx *ctx = (NULL != pdf) ? pdf->ctx : NULL; /* guard first: pdf may be NULL */
+    /* Callback for the "AA" name: bump pdf->stats.naa, but only while metadata JSON is collected. */
+    if ((NULL == ctx) || !(ctx->this_layer_metadata_json) || !(SCAN_COLLECT_METADATA)) {
         return;
+    }

-    pdf->stats.nacroform++;
+    // TODO: Find a way to not count references to the same automatic action multiple times
+    pdf->stats.naa++;
+}
+
+static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
+{
+    UNUSEDPARAM(obj);
+    UNUSEDPARAM(act);
+
+    cli_ctx *ctx = (NULL != pdf) ? pdf->ctx : NULL; /* guard first: pdf may be NULL */
+    /* Callback that bumps pdf->stats.nrichmedia, but only while metadata JSON is collected. */
+    if ((NULL == ctx) || !(ctx->this_layer_metadata_json) || !(SCAN_COLLECT_METADATA)) {
+        return;
+    }
+
+    pdf->stats.nrichmedia++;
 }

 static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
@@ -4589,8 +4621,11 @@ static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
     UNUSEDPARAM(obj);
     UNUSEDPARAM(act);

-    if (NULL == pdf)
+    cli_ctx *ctx = (NULL != pdf) ? pdf->ctx : NULL; /* guard first: pdf may be NULL */
+    /* Bump pdf->stats.nxfa only while metadata JSON is collected. */
+    if ((NULL == ctx) || !(ctx->this_layer_metadata_json) || !(SCAN_COLLECT_METADATA)) {
         return;
+    }

     pdf->stats.nxfa++;
 }
@@ -4759,6 +4794,8 @@ static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
         return;
     }

+    pdf->stats.nuri++;
+
     if (obj->objstm) {
         bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
     } else {
@@ -5118,6 +5155,20 @@ static void pdf_export_json(struct pdf_struct *pdf)
         cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform);
     if (pdf->stats.nxfa)
         cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa);
+    if (pdf->stats.naa)
+        cli_jsonint(pdfobj, "AutomaticActionCount", pdf->stats.naa);
+    if (pdf->stats.nstream)
+        cli_jsonint(pdfobj, "StreamCount", pdf->stats.nstream);
+    if (pdf->nobjs)
+        cli_jsonint(pdfobj, "ObjectCount", pdf->nobjs);
+    if (pdf->stats.nobjstream)
+        cli_jsonint(pdfobj, "ObjectStreamCount", pdf->stats.nobjstream);
+    if (pdf->stats.ntrailer)
+        cli_jsonint(pdfobj, "TrailerCount", pdf->stats.ntrailer);
+    if (pdf->stats.nuri)
+        cli_jsonint(pdfobj, "URICount", pdf->stats.nuri);
+    if (pdf->stats.nxref)
+        cli_jsonint(pdfobj, "XRefCount", pdf->stats.nxref);
     if (pdf->flags & (1 << BAD_PDF_VERSION))
         cli_jsonbool(pdfobj, "BadVersion", 1);
     if (pdf->flags & (1 << BAD_PDF_HEADERPOS))
diff --git a/libclamav/pdf.h b/libclamav/pdf.h
index b3c928f2c..88b344749 100644
--- a/libclamav/pdf.h
+++ b/libclamav/pdf.h
@@ -102,39 +102,45 @@ struct pdf_stats_entry {
 };

 struct pdf_stats {
-    int32_t ninvalidobjs;                     /* Number of invalid objects */
-    int32_t njs;                              /* Number of javascript objects */
-    int32_t nflate;                           /* Number of flate-encoded objects */
+    int32_t naa;                              /* Number of Automatic Action ("AA") names seen; repeated references are each counted */
+    int32_t nacroform;                        /* Number of AcroForm objects */
     int32_t nactivex;                         /* Number of ActiveX objects */
-    int32_t nflash;                           /* Number of flash objects */
-    int32_t ncolors;                          /* Number of colors */
-    int32_t nasciihexdecode;                  /* Number of ASCIIHexDecode-filtered objects */
     int32_t nascii85decode;                   /* Number of ASCII85Decode-filtered objects */
+    int32_t nasciihexdecode;                  /* Number of ASCIIHexDecode-filtered objects */
+    int32_t ncolors;                          /* Number of colors */
+    int32_t ncrypt;                           /* Number of Crypt-filtered objects */
+    int32_t ndctdecode;                       /* Number of DCTDecode-filtered objects */
     int32_t nembeddedfile;                    /* Number of embedded files */
-    int32_t nimage;                           /* Number of image objects */
-    int32_t nlzw;                             /* Number of LZW-filtered objects */
-    int32_t nrunlengthdecode;                 /* Number of RunLengthDecode-filtered objects */
     int32_t nfaxdecode;                       /* Number of CCITT-filtered objects */
+    int32_t nflash;                           /* Number of flash objects */
+    int32_t nflate;                           /* Number of flate-encoded objects */
+    int32_t nimage;                           /* Number of image objects */
+    int32_t ninvalidobjs;                     /* Number of invalid objects */
     int32_t njbig2decode;                     /* Number of JBIG2Decode-filtered objects */
-    int32_t ndctdecode;                       /* Number of DCTDecode-filtered objects */
     int32_t njpxdecode;                       /* Number of JPXDecode-filtered objects */
-    int32_t ncrypt;                           /* Number of Crypt-filtered objects */
-    int32_t nstandard;                        /* Number of Standard-filtered objects */
-    int32_t nsigned;                          /* Number of Signed objects */
-    int32_t nopenaction;                      /* Number of OpenAction objects */
+    int32_t njs;                              /* Number of javascript objects */
     int32_t nlaunch;                          /* Number of Launch objects */
+    int32_t nlzw;                             /* Number of LZW-filtered objects */
+    int32_t nobjstream;                       /* Number of object streams (/ObjStm) found while extracting objects */
+    int32_t nopenaction;                      /* Number of OpenAction objects */
     int32_t npage;                            /* Number of Page objects */
     int32_t nrichmedia;                       /* Number of RichMedia objects */
-    int32_t nacroform;                        /* Number of AcroForm objects */
+    int32_t nrunlengthdecode;                 /* Number of RunLengthDecode-filtered objects */
+    int32_t nsigned;                          /* Number of Signed objects */
+    int32_t nstandard;                        /* Number of Standard-filtered objects */
+    int32_t nstream;                          /* Number of objects found to contain a stream */
+    int32_t ntrailer;                         /* Number of trailers parsed */
+    int32_t nuri;                             /* Number of URI objects */
     int32_t nxfa;                             /* Number of XFA objects */
+    int32_t nxref;                            /* Number of xrefs that passed the xref check */
     struct pdf_stats_entry *author;           /* Author of the PDF */
-    struct pdf_stats_entry *creator;          /* Application used to create the PDF */
-    struct pdf_stats_entry *producer;         /* Application used to produce the PDF */
     struct pdf_stats_entry *creationdate;     /* Date the PDF was created */
+    struct pdf_stats_entry *creator;          /* Application used to create the PDF */
+    struct pdf_stats_entry *keywords;         /* Keywords of the PDF */
     struct pdf_stats_entry *modificationdate; /* Date the PDF was modified */
-    struct pdf_stats_entry *title;            /* Title of the PDF */
+    struct pdf_stats_entry *producer;         /* Application used to produce the PDF */
     struct pdf_stats_entry *subject;          /* Subject of the PDF */
-    struct pdf_stats_entry *keywords;         /* Keywords of the PDF */
+    struct pdf_stats_entry *title;            /* Title of the PDF */
 };

 enum enc_method {
diff --git a/unit_tests/clamscan/pdf_stats_test.py b/unit_tests/clamscan/pdf_stats_test.py
new file mode 100644
index 000000000..a40c7691e
--- /dev/null
+++ b/unit_tests/clamscan/pdf_stats_test.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+
+"""
+Run clamscan tests for the PDF statistics exported in the metadata JSON.
+"""
+
+import sys
+import os
+import re
+import shutil
+
+sys.path.append('../unit_tests')
+import testcase
+
+
+class TC(testcase.TestCase):  # Checks the PDF statistic counters that clamscan --gen-json reports
+    @classmethod
+    def setUpClass(cls):
+        super(TC, cls).setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        super(TC, cls).tearDownClass()
+
+    def setUp(self):
+        super(TC, self).setUp()
+
+    def tearDown(self):
+        super(TC, self).tearDown()
+
+        # Remove scan temps directory between tests
+        if (self.path_tmp / "TD").exists():
+            shutil.rmtree(self.path_tmp / "TD")
+
+        self.verify_valgrind_log()
+
+    def test_pdf_stats(self):
+        self.step_name('Test PDF Stats')
+
+        tempdir=self.path_tmp / "TD"
+        if not os.path.isdir(tempdir):
+            os.makedirs(tempdir)
+
+        testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf'
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format(  # temps are kept, presumably so the metadata JSON can be read back from tempdir below
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+            path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
+            tempdir=tempdir,
+            testfile=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 0  # exit code 0 means the scan finished "clean" (no detection)
+
+        expected_strings = [  # JSON fragments to look for; the fixture has no /URI entry, so no URICount is expected
+            '"JavaScriptObjectCount":1,',
+            '"EmbeddedFileCount":2,',
+            '"JBIG2DecodeCount":2,',
+            '"OpenActionCount":2,',
+            '"LaunchCount":2,',
+            '"PageCount":2,',
+            '"RichMediaCount":2,',
+            '"AcroFormCount":2,',
+            '"XFACount":2,',
+            '"AutomaticActionCount":2,',
+            '"StreamCount":7,',
+            '"ObjectCount":16,',
+            '"ObjectStreamCount":1,',
+            '"TrailerCount":1,',
+            '"XRefCount":1'
+        ]
+        self.verify_metadata_json(tempdir, expected_strings)
diff --git a/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf b/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf
new file mode 100644
index 000000000..fd85573ba
--- /dev/null
+++ b/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf
@@ -0,0 +1,167 @@
+%PDF-1.7
+1 0 obj
+<<
+  /Type /Catalog
+  /Pages 2 0 R
+  /OpenAction 5 0 R
+  /Launch 6 0 R
+  /EmbeddedFile 7 0 R
+  /AcroForm 8 0 R
+  /ObjStm 9 0 R
+  /JBIG2Decode 10 0 R
+  /RichMedia 11 0 R
+  /XFA 12 0 R
+  /AA 15 0 R
+>>
+endobj
+
+2 0 obj
+<<
+  /Type /Pages
+  /Count 2
+  /Kids [3 0 R 4 0 R]
+>>
+endobj
+
+3 0 obj
+<<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 13 0 R
+  /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >>
+>>
+endobj
+
+4 0 obj
+<<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 14 0 R
+  /OpenAction 5 0 R
+  /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >>
+>>
+endobj
+
+5 0 obj
+<<
+  /S /JavaScript
+  /JS (alert("Hello from JS"))
+>>
+endobj
+
+6 0 obj
+<<
+  /S /Launch
+  /F (dummy.exe)
+>>
+endobj
+
+7 0 obj
+<<
+  /Type /EmbeddedFile
+  /Length 11
+>>
+stream
+HelloWorld
+endstream
+endobj
+
+8 0 obj
+<<
+  /Type /AcroForm
+  /Fields []
+>>
+endobj
+
+9 0 obj
+<<
+  /Type /ObjStm
+  /N 1
+  /First 4
+  /Length 30
+>>
+stream
+17 0 << /Test /ObjStmEmbedded >>
+endstream
+endobj
+
+10 0 obj
+<<
+  /Filter /JBIG2Decode
+  /Length 9
+>>
+stream
+JBIG2DATA
+endstream
+endobj
+
+11 0 obj
+<<
+  /Type /RichMedia
+  /Length 9
+>>
+stream
+RichMedia
+endstream
+endobj
+
+12 0 obj
+<<
+  /Type /XFA
+  /Length 3
+>>
+stream
+XFA
+endstream
+endobj
+
+13 0 obj
+<<
+  /Length 37
+>>
+stream
+BT /F1 24 Tf 100 700 Td (Test 1) Tj ET
+endstream
+endobj
+
+14 0 obj
+<<
+  /Length 37
+>>
+stream
+BT /F1 24 Tf 100 700 Td (Test 2) Tj ET
+endstream
+endobj
+
+15 0 obj
+<<
+  /AA << /O 5 0 R >>
+>>
+endobj
+
+xref
+0 17
+0000000000 65535 f
+0000000009 00000 n
+0000000232 00000 n
+0000000305 00000 n
+0000000462 00000 n
+0000000639 00000 n
+0000000708 00000 n
+0000000760 00000 n
+0000000845 00000 n
+0000000899 00000 n
+0000001009 00000 n
+0000001095 00000 n
+0000001176 00000 n
+0000001245 00000 n
+0000001335 00000 n
+0000001425 00000 n
+trailer
+<<
+  /Size 17
+  /Root 1 0 R
+>>
+startxref
+1478
+%%EOF
\ No newline at end of file