Commit 17f74cf2f for clamav.net
commit 17f74cf2f3e21dfcdd85181685f76dda72857da2
Author: John Humlick <15677335+jhumlick@users.noreply.github.com>
Date: Wed Jan 21 09:13:01 2026 -0800
libclamav: Add PDF stats for metadata collection. (#1546)
More PDF statistics were requested for feature parity.
If metadata collection is enabled, the following additional PDF
statistics will be collected:
- Number of Automatic Actions
- Number of Streams
- Number of Objects
- Number of Object Streams
- Number of Trailers
- Number of URIs
- Number of Xrefs
Additionally, some of the parsing logic was fixed during testing of
these features.
CLAM-2820
diff --git a/libclamav/pdf.c b/libclamav/pdf.c
index 035a403d7..3752af220 100644
--- a/libclamav/pdf.c
+++ b/libclamav/pdf.c
@@ -87,6 +87,7 @@ static void pdf_export_json(struct pdf_struct *);
static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
+static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
@@ -1653,12 +1654,13 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
* is an object stream. If so, collect the relevant info.
*/
dict_len = obj->stream - start;
- if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
+ if (NULL != (pstr = pdf_getdict(start, &dict_len, "/ObjStm"))) {
int objstm_first = -1;
int objstm_length = -1;
int objstm_n = -1;
- cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
+ cli_dbgmsg("pdf_extract_obj: Found /ObjStm\n");
+ pdf->stats.nobjstream++;
dict_len = obj->stream - start;
if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) {
@@ -1669,14 +1671,19 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
} else {
/* Add objstm to pdf struct, so it can be freed eventually */
- pdf->nobjstms++;
- pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
+ if (!pdf->objstms) {
+ pdf->objstms = malloc(sizeof(struct objstm_struct *));
+ } else {
+ pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * (pdf->nobjstms + 1));
+ }
if (!pdf->objstms) {
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
status = CL_EMEM;
goto done;
}
+ pdf->nobjstms++;
+
CLI_CALLOC_OR_GOTO_DONE(
objstm, 1, sizeof(struct objstm_struct),
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms),
@@ -1953,6 +1960,7 @@ struct pdfname_action {
};
static struct pdfname_action pdfname_actions[] = {
+ {"AA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AutomaticAction_cb},
{"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb},
{"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
{"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
@@ -2137,7 +2145,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length)
{
const char *enc;
-
+ pdf->stats.ntrailer++;
enc = cli_memstr(s, length, "/Encrypt", 8);
if (enc) {
char *newID;
@@ -2221,6 +2229,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
if ((CL_SUCCESS == has_stream) ||
(CL_EFORMAT == has_stream)) {
/* Stream found. Store this fact and the stream bounds. */
+ pdf->stats.nstream++;
cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size);
obj->flags |= (1 << OBJ_STREAM);
obj->stream = stream;
@@ -3900,6 +3909,8 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
if (!q || xrefCheck(q, q + bytesleft) == -1) {
cli_dbgmsg("cli_pdf: did not find valid xref\n");
pdf.flags |= 1 << BAD_PDF_TRAILER;
+ } else {
+ pdf.stats.nxref++;
}
}
}
@@ -4562,26 +4573,47 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
}
}
-static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
+/*
+ * AcroForm_cb - name callback that counts AcroForm occurrences.
+ *
+ * Increments pdf->stats.nacroform. Nothing is counted unless pdf and
+ * pdf->ctx are non-NULL, the current layer has a metadata JSON object and
+ * SCAN_COLLECT_METADATA evaluates true. obj and act are unused.
+ */
+static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
 {
     UNUSEDPARAM(obj);
     UNUSEDPARAM(act);
-    if (NULL == pdf)
+    cli_ctx *ctx;
+
+    /* Test pdf before reading pdf->ctx: dereferencing it first would turn
+     * the NULL check into dead code. */
+    if (NULL == pdf || NULL == pdf->ctx) {
         return;
+    }
+
+    /* NOTE(review): SCAN_COLLECT_METADATA presumably expands against a local
+     * named ctx - confirm against its definition. */
+    ctx = pdf->ctx;
+    if (!(ctx->this_layer_metadata_json) || !(SCAN_COLLECT_METADATA)) {
+        return;
+    }
-    pdf->stats.nrichmedia++;
+    pdf->stats.nacroform++;
 }
-static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
+/*
+ * AutomaticAction_cb - name callback registered for "AA" in pdfname_actions.
+ *
+ * Increments pdf->stats.naa once per invocation. Nothing is counted unless
+ * pdf and pdf->ctx are non-NULL, the current layer has a metadata JSON
+ * object and SCAN_COLLECT_METADATA evaluates true. obj and act are unused.
+ */
+static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
 {
     UNUSEDPARAM(obj);
     UNUSEDPARAM(act);
-    if (NULL == pdf)
+    cli_ctx *ctx;
+
+    /* Test pdf before reading pdf->ctx: dereferencing it first would turn
+     * the NULL check into dead code. */
+    if (NULL == pdf || NULL == pdf->ctx) {
         return;
+    }
+
+    /* NOTE(review): SCAN_COLLECT_METADATA presumably expands against a local
+     * named ctx - confirm against its definition. */
+    ctx = pdf->ctx;
+    if (!(ctx->this_layer_metadata_json) || !(SCAN_COLLECT_METADATA)) {
+        return;
+    }
-    pdf->stats.nacroform++;
+    /* TODO: Find a way to not count references to the same automatic action multiple times */
+    pdf->stats.naa++;
+}
+
+/*
+ * RichMedia_cb - name callback that counts RichMedia occurrences.
+ *
+ * Increments pdf->stats.nrichmedia. Nothing is counted unless pdf and
+ * pdf->ctx are non-NULL, the current layer has a metadata JSON object and
+ * SCAN_COLLECT_METADATA evaluates true. obj and act are unused.
+ */
+static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
+{
+    UNUSEDPARAM(obj);
+    UNUSEDPARAM(act);
+
+    cli_ctx *ctx;
+
+    /* Test pdf before reading pdf->ctx: dereferencing it first would turn
+     * the NULL check into dead code. */
+    if (NULL == pdf || NULL == pdf->ctx) {
+        return;
+    }
+
+    /* NOTE(review): SCAN_COLLECT_METADATA presumably expands against a local
+     * named ctx - confirm against its definition. */
+    ctx = pdf->ctx;
+    if (!(ctx->this_layer_metadata_json) || !(SCAN_COLLECT_METADATA)) {
+        return;
+    }
+
+    pdf->stats.nrichmedia++;
 }
static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
@@ -4589,8 +4621,11 @@ static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
UNUSEDPARAM(obj);
UNUSEDPARAM(act);
-    if (NULL == pdf)
+    cli_ctx *ctx;
+
+    /* Test pdf before reading pdf->ctx: dereferencing it first would turn
+     * the NULL check into dead code. */
+    if (NULL == pdf || NULL == pdf->ctx) {
         return;
+    }
+
+    /* NOTE(review): SCAN_COLLECT_METADATA presumably expands against a local
+     * named ctx - confirm against its definition. */
+    ctx = pdf->ctx;
+    if (!(ctx->this_layer_metadata_json) || !(SCAN_COLLECT_METADATA)) {
+        return;
+    }
pdf->stats.nxfa++;
}
@@ -4759,6 +4794,8 @@ static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
return;
}
+ pdf->stats.nuri++;
+
if (obj->objstm) {
bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
} else {
@@ -5118,6 +5155,20 @@ static void pdf_export_json(struct pdf_struct *pdf)
cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform);
if (pdf->stats.nxfa)
cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa);
+ if (pdf->stats.naa)
+ cli_jsonint(pdfobj, "AutomaticActionCount", pdf->stats.naa);
+ if (pdf->stats.nstream)
+ cli_jsonint(pdfobj, "StreamCount", pdf->stats.nstream);
+ if (pdf->nobjs)
+ cli_jsonint(pdfobj, "ObjectCount", pdf->nobjs);
+ if (pdf->stats.nobjstream)
+ cli_jsonint(pdfobj, "ObjectStreamCount", pdf->stats.nobjstream);
+ if (pdf->stats.ntrailer)
+ cli_jsonint(pdfobj, "TrailerCount", pdf->stats.ntrailer);
+ if (pdf->stats.nuri)
+ cli_jsonint(pdfobj, "URICount", pdf->stats.nuri);
+ if (pdf->stats.nxref)
+ cli_jsonint(pdfobj, "XRefCount", pdf->stats.nxref);
if (pdf->flags & (1 << BAD_PDF_VERSION))
cli_jsonbool(pdfobj, "BadVersion", 1);
if (pdf->flags & (1 << BAD_PDF_HEADERPOS))
diff --git a/libclamav/pdf.h b/libclamav/pdf.h
index b3c928f2c..88b344749 100644
--- a/libclamav/pdf.h
+++ b/libclamav/pdf.h
@@ -102,39 +102,45 @@ struct pdf_stats_entry {
};
struct pdf_stats {
- int32_t ninvalidobjs; /* Number of invalid objects */
- int32_t njs; /* Number of javascript objects */
- int32_t nflate; /* Number of flate-encoded objects */
+ int32_t naa; /* Number of /AA name matches; an action referenced several times is counted each time */
+ int32_t nacroform; /* Number of AcroForm objects */
int32_t nactivex; /* Number of ActiveX objects */
- int32_t nflash; /* Number of flash objects */
- int32_t ncolors; /* Number of colors */
- int32_t nasciihexdecode; /* Number of ASCIIHexDecode-filtered objects */
int32_t nascii85decode; /* Number of ASCII85Decode-filtered objects */
+ int32_t nasciihexdecode; /* Number of ASCIIHexDecode-filtered objects */
+ int32_t ncolors; /* Number of colors */
+ int32_t ncrypt; /* Number of Crypt-filtered objects */
+ int32_t ndctdecode; /* Number of DCTDecode-filtered objects */
int32_t nembeddedfile; /* Number of embedded files */
- int32_t nimage; /* Number of image objects */
- int32_t nlzw; /* Number of LZW-filtered objects */
- int32_t nrunlengthdecode; /* Number of RunLengthDecode-filtered objects */
int32_t nfaxdecode; /* Number of CCITT-filtered objects */
+ int32_t nflash; /* Number of flash objects */
+ int32_t nflate; /* Number of flate-encoded objects */
+ int32_t nimage; /* Number of image objects */
+ int32_t ninvalidobjs; /* Number of invalid objects */
int32_t njbig2decode; /* Number of JBIG2Decode-filtered objects */
- int32_t ndctdecode; /* Number of DCTDecode-filtered objects */
int32_t njpxdecode; /* Number of JPXDecode-filtered objects */
- int32_t ncrypt; /* Number of Crypt-filtered objects */
- int32_t nstandard; /* Number of Standard-filtered objects */
- int32_t nsigned; /* Number of Signed objects */
- int32_t nopenaction; /* Number of OpenAction objects */
+ int32_t njs; /* Number of javascript objects */
int32_t nlaunch; /* Number of Launch objects */
+ int32_t nlzw; /* Number of LZW-filtered objects */
+ int32_t nobjstream; /* Number of object streams (stream objects whose dictionary matches /ObjStm) */
+ int32_t nopenaction; /* Number of OpenAction objects */
int32_t npage; /* Number of Page objects */
int32_t nrichmedia; /* Number of RichMedia objects */
- int32_t nacroform; /* Number of AcroForm objects */
+ int32_t nrunlengthdecode; /* Number of RunLengthDecode-filtered objects */
+ int32_t nsigned; /* Number of Signed objects */
+ int32_t nstandard; /* Number of Standard-filtered objects */
+ int32_t nstream; /* Number of objects found to contain a stream */
+ int32_t ntrailer; /* Number of trailers parsed (one per pdf_parse_trailer call) */
+ int32_t nuri; /* Number of URI objects */
int32_t nxfa; /* Number of XFA objects */
+ int32_t nxref; /* Number of xrefs that passed the xref check in cli_pdf */
struct pdf_stats_entry *author; /* Author of the PDF */
- struct pdf_stats_entry *creator; /* Application used to create the PDF */
- struct pdf_stats_entry *producer; /* Application used to produce the PDF */
struct pdf_stats_entry *creationdate; /* Date the PDF was created */
+ struct pdf_stats_entry *creator; /* Application used to create the PDF */
+ struct pdf_stats_entry *keywords; /* Keywords of the PDF */
struct pdf_stats_entry *modificationdate; /* Date the PDF was modified */
- struct pdf_stats_entry *title; /* Title of the PDF */
+ struct pdf_stats_entry *producer; /* Application used to produce the PDF */
struct pdf_stats_entry *subject; /* Subject of the PDF */
- struct pdf_stats_entry *keywords; /* Keywords of the PDF */
+ struct pdf_stats_entry *title; /* Title of the PDF */
};
enum enc_method {
diff --git a/unit_tests/clamscan/pdf_stats_test.py b/unit_tests/clamscan/pdf_stats_test.py
new file mode 100644
index 000000000..a40c7691e
--- /dev/null
+++ b/unit_tests/clamscan/pdf_stats_test.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+
+"""
+Run clamscan tests.
+"""
+
+import sys
+import os
+import re
+import shutil
+
+sys.path.append('../unit_tests')
+import testcase
+
+
+class TC(testcase.TestCase):
+    """clamscan test: PDF statistics written to the metadata JSON."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Delegate class-level setup to the base test case."""
+        super().setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        """Delegate class-level teardown to the base test case."""
+        super().tearDownClass()
+
+    def setUp(self):
+        """Delegate per-test setup to the base test case."""
+        super().setUp()
+
+    def tearDown(self):
+        """Run base teardown, drop the scan temp directory, check the valgrind log."""
+        super().tearDown()
+
+        # Remove scan temps directory between tests
+        scan_tmp = self.path_tmp / "TD"
+        if scan_tmp.exists():
+            shutil.rmtree(scan_tmp)
+
+        self.verify_valgrind_log()
+
+    def test_pdf_stats(self):
+        """Scan pdf-stats-test.pdf with --gen-json and check the reported PDF counters."""
+        self.step_name('Test PDF Stats')
+
+        # --leave-temps keeps the scan temp directory so it can be inspected below.
+        tempdir = self.path_tmp / "TD"
+        os.makedirs(tempdir, exist_ok=True)
+
+        inputs = TC.path_source / 'unit_tests' / 'input'
+        fields = {
+            'valgrind': TC.valgrind,
+            'valgrind_args': TC.valgrind_args,
+            'clamscan': TC.clamscan,
+            'path_db': inputs / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
+            'tempdir': tempdir,
+            'testfile': inputs / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf',
+        }
+        template = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'
+        result = self.execute_command(template.format(**fields))
+
+        assert result.ec == 0  # clean
+
+        # Strings that must appear in the metadata JSON left in tempdir.
+        expected_strings = [
+            '"JavaScriptObjectCount":1,',
+            '"EmbeddedFileCount":2,',
+            '"JBIG2DecodeCount":2,',
+            '"OpenActionCount":2,',
+            '"LaunchCount":2,',
+            '"PageCount":2,',
+            '"RichMediaCount":2,',
+            '"AcroFormCount":2,',
+            '"XFACount":2,',
+            '"AutomaticActionCount":2,',
+            '"StreamCount":7,',
+            '"ObjectCount":16,',
+            '"ObjectStreamCount":1,',
+            '"TrailerCount":1,',
+            '"XRefCount":1'
+        ]
+        self.verify_metadata_json(tempdir, expected_strings)
diff --git a/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf b/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf
new file mode 100644
index 000000000..fd85573ba
--- /dev/null
+++ b/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf
@@ -0,0 +1,167 @@
+%PDF-1.7
+1 0 obj
+<<
+ /Type /Catalog
+ /Pages 2 0 R
+ /OpenAction 5 0 R
+ /Launch 6 0 R
+ /EmbeddedFile 7 0 R
+ /AcroForm 8 0 R
+ /ObjStm 9 0 R
+ /JBIG2Decode 10 0 R
+ /RichMedia 11 0 R
+ /XFA 12 0 R
+ /AA 15 0 R
+>>
+endobj
+
+2 0 obj
+<<
+ /Type /Pages
+ /Count 2
+ /Kids [3 0 R 4 0 R]
+>>
+endobj
+
+3 0 obj
+<<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 13 0 R
+ /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >>
+>>
+endobj
+
+4 0 obj
+<<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 14 0 R
+ /OpenAction 5 0 R
+ /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >>
+>>
+endobj
+
+5 0 obj
+<<
+ /S /JavaScript
+ /JS (alert("Hello from JS"))
+>>
+endobj
+
+6 0 obj
+<<
+ /S /Launch
+ /F (dummy.exe)
+>>
+endobj
+
+7 0 obj
+<<
+ /Type /EmbeddedFile
+ /Length 11
+>>
+stream
+HelloWorld
+endstream
+endobj
+
+8 0 obj
+<<
+ /Type /AcroForm
+ /Fields []
+>>
+endobj
+
+9 0 obj
+<<
+ /Type /ObjStm
+ /N 1
+ /First 4
+ /Length 30
+>>
+stream
+17 0 << /Test /ObjStmEmbedded >>
+endstream
+endobj
+
+10 0 obj
+<<
+ /Filter /JBIG2Decode
+ /Length 9
+>>
+stream
+JBIG2DATA
+endstream
+endobj
+
+11 0 obj
+<<
+ /Type /RichMedia
+ /Length 9
+>>
+stream
+RichMedia
+endstream
+endobj
+
+12 0 obj
+<<
+ /Type /XFA
+ /Length 3
+>>
+stream
+XFA
+endstream
+endobj
+
+13 0 obj
+<<
+ /Length 37
+>>
+stream
+BT /F1 24 Tf 100 700 Td (Test 1) Tj ET
+endstream
+endobj
+
+14 0 obj
+<<
+ /Length 37
+>>
+stream
+BT /F1 24 Tf 100 700 Td (Test 2) Tj ET
+endstream
+endobj
+
+15 0 obj
+<<
+ /AA << /O 5 0 R >>
+>>
+endobj
+
+xref
+0 17
+0000000000 65535 f
+0000000009 00000 n
+0000000232 00000 n
+0000000305 00000 n
+0000000462 00000 n
+0000000639 00000 n
+0000000708 00000 n
+0000000760 00000 n
+0000000845 00000 n
+0000000899 00000 n
+0000001009 00000 n
+0000001095 00000 n
+0000001176 00000 n
+0000001245 00000 n
+0000001335 00000 n
+0000001425 00000 n
+trailer
+<<
+ /Size 17
+ /Root 1 0 R
+>>
+startxref
+1478
+%%EOF
\ No newline at end of file