Commit d2fa46d76 for clamav.net

commit d2fa46d76cd490b553b956f7861a7e368b589f73
Author: Val S. <micasnyd@cisco.com>
Date:   Mon Jun 30 10:47:02 2025 -0400

    Fix integer overflow in PDF parser (#1523)

    The ascii85decode function calculates the amount of memory to reserve as
    a function of (4 * bytes) + 1. Since the result is stored in a uint32_t,
    we need to make sure that this calculation will not overflow. If we
    detect that an overflow would occur, return CL_EFORMAT and do not
    proceed.

    Also check additional potential overflow conditions: other areas were
    identified that could overflow, and this commit adds checks to prevent
    those overflows as well.

    Thank you Greg Walkup at Sandia National Labs for reporting this issue.

    CLAM-2752
    CLAM-2757
    CLAM-2759

    Co-authored-by: John Humlick <15677335+jhumlick@users.noreply.github.com>

diff --git a/libclamav/pdf.c b/libclamav/pdf.c
index adcc42351..a56ddd24b 100644
--- a/libclamav/pdf.c
+++ b/libclamav/pdf.c
@@ -441,7 +441,7 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,

         if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
             /* Failed to find obj offset for next obj */
-            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
+            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%zu} more.\n", objstm->n - objstm->nobjs_found);
             status = CL_EPARSE;
             goto done;
         } else if (temp_long < 0) {
@@ -1585,18 +1585,18 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
                 }
             }

-            cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
+            cli_dbgmsg("pdf_extract_obj: calculated length %zu\n", length);
         } else {
             if (obj->stream_size > (size_t)length + 2) {
                 cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
-                           (size_t)length, obj->stream_size);
+                           length, obj->stream_size);
                 length = obj->stream_size;
             }
         }

-        if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) {
-            cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
-                       (long long)orig_length, (long long)length, obj->stream_size);
+        if ((0 != orig_length) && (obj->stream_size > orig_length + 20)) {
+            cli_dbgmsg("pdf_extract_obj: orig length: %zu, length: %zu, size: %zu\n",
+                       orig_length, length, obj->stream_size);
             pdfobj_flag(pdf, obj, BAD_STREAMLEN);
         }

@@ -1653,18 +1653,18 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
          */
         dict_len = obj->stream - start;
         if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
-            int32_t objstm_first  = -1;
-            int32_t objstm_length = -1;
-            int32_t objstm_n      = -1;
+            int objstm_first  = -1;
+            int objstm_length = -1;
+            int objstm_n      = -1;

             cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");

             dict_len = obj->stream - start;
-            if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First")))) {
+            if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) {
                 cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
-            } else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length")))) {
+            } else if (-1 == (objstm_length = pdf_readint(start, dict_len, "/Length"))) {
                 cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
-            } else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N")))) {
+            } else if (-1 == (objstm_n = pdf_readint(start, dict_len, "/N"))) {
                 cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
             } else {
                 /* Add objstm to pdf struct, so it can be freed eventually */
@@ -1686,15 +1686,15 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t

                 memset(objstm, 0, sizeof(*objstm));

-                objstm->first        = (uint32_t)objstm_first;
-                objstm->current      = (uint32_t)objstm_first;
+                objstm->first        = (size_t)objstm_first;
+                objstm->current      = (size_t)objstm_first;
                 objstm->current_pair = 0;
-                objstm->length       = (uint32_t)objstm_length;
-                objstm->n            = (uint32_t)objstm_n;
+                objstm->length       = (size_t)objstm_length;
+                objstm->n            = (size_t)objstm_n;

-                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
-                cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
-                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
+                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %zu\n", objstm->first);
+                cli_dbgmsg("pdf_extract_obj: ObjStm length is %zu bytes\n", objstm->length);
+                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %zu objects\n", objstm->n);
             }
         }

@@ -3600,7 +3600,7 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs
         retval = pdf_findobj_in_objstm(pdf, objstm, &obj);
         if (retval != CL_SUCCESS) {
             if (retval != CL_BREAK) {
-                cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %u found, %u expected.\n",
+                cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %zu found, %zu expected.\n",
                            objstm->nobjs_found, objstm->n);
                 badobjects++;
                 pdf->stats.ninvalidobjs++;
diff --git a/libclamav/pdf.h b/libclamav/pdf.h
index f27a70f99..b3c928f2c 100644
--- a/libclamav/pdf.h
+++ b/libclamav/pdf.h
@@ -27,14 +27,14 @@
 #define PDF_OBJECT_RECURSION_LIMIT 25

 struct objstm_struct {
-    uint32_t first;        // offset of first obj
-    uint32_t current;      // offset of current obj
-    uint32_t current_pair; // offset of current pair describing id, location of object
-    uint32_t length;       // total length of all objects (starting at first)
-    uint32_t n;            // number of objects that should be found in the object stream
-    uint32_t nobjs_found;  // number of objects actually found in the object stream
-    char *streambuf;       // address of stream buffer, beginning with first obj pair
-    size_t streambuf_len;  // length of stream buffer, includes pairs followed by actual objects
+    size_t first;         // offset of first obj
+    size_t current;       // offset of current obj
+    size_t current_pair;  // offset of current pair describing id, location of object
+    size_t length;        // total length of all objects (starting at first)
+    size_t n;             // number of objects that should be found in the object stream
+    size_t nobjs_found;   // number of objects actually found in the object stream
+    char *streambuf;      // address of stream buffer, beginning with first obj pair
+    size_t streambuf_len; // length of stream buffer, includes pairs followed by actual objects
 };

 struct pdf_obj {
diff --git a/libclamav/pdfdecode.c b/libclamav/pdfdecode.c
index ca8be3f48..10649e456 100644
--- a/libclamav/pdfdecode.c
+++ b/libclamav/pdfdecode.c
@@ -73,7 +73,7 @@
 struct pdf_token {
     uint32_t flags;   /* tracking flags */
     uint32_t success; /* successfully decoded filters */
-    uint32_t length;  /* length of current content; TODO: transition to size_t */
+    size_t length;    /* length of current content */
     uint8_t *content; /* content stream */
 };

@@ -401,10 +401,16 @@ static cl_error_t filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *o
     uint32_t declen = 0;

     const uint8_t *ptr = (uint8_t *)token->content;
-    uint32_t remaining = token->length;
+    size_t remaining   = token->length;
     int quintet = 0, rc = CL_SUCCESS;
     uint64_t sum = 0;

+    /* Check for overflow */
+    if (remaining > (SIZE_MAX / 4)) {
+        cli_dbgmsg("cli_pdf: ascii85decode: overflow detected\n");
+        return CL_EFORMAT;
+    }
+
     /* 5:4 decoding ratio, with 1:4 expansion sequences => (4*length)+1 */
     if (!(dptr = decoded = (uint8_t *)cli_max_malloc((4 * remaining) + 1))) {
         cli_errmsg("cli_pdf: cannot allocate memory for decoded output\n");
@@ -791,8 +797,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
     uint8_t *decoded;

     const uint8_t *content = (uint8_t *)token->content;
-    uint32_t length        = token->length;
-    uint32_t i, j;
+    size_t length          = token->length;
+    size_t i, j;
     cl_error_t rc = CL_SUCCESS;

     if (!(decoded = (uint8_t *)cli_max_calloc(length / 2 + 1, sizeof(uint8_t)))) {
@@ -822,8 +828,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
     if (rc == CL_SUCCESS) {
         free(token->content);

-        cli_dbgmsg("cli_pdf: deflated %lu bytes from %lu total bytes\n",
-                   (unsigned long)j, (unsigned long)(token->length));
+        cli_dbgmsg("cli_pdf: deflated %zu bytes from %zu total bytes\n",
+                   j, token->length);

         token->content = decoded;
         token->length  = j;
@@ -831,8 +837,8 @@ static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *
         if (!(obj->flags & ((1 << OBJ_IMAGE) | (1 << OBJ_TRUNCATED))))
             pdfobj_flag(pdf, obj, BAD_ASCIIDECODE);

-        cli_dbgmsg("cli_pdf: error occurred parsing byte %lu of %lu\n",
-                   (unsigned long)i, (unsigned long)(token->length));
+        cli_dbgmsg("cli_pdf: error occurred parsing byte %zu of %zu\n",
+                   i, token->length);
         free(decoded);
     }
     return rc;
@@ -873,27 +879,29 @@ static cl_error_t filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, st
         return CL_EPARSE; /* TODO: what should this value be? CL_SUCCESS would mirror previous behavior */
     }

-    cli_dbgmsg("cli_pdf: decrypted %zu bytes from %u total bytes\n",
+    cli_dbgmsg("cli_pdf: decrypted %zu bytes from %zu total bytes\n",
                length, token->length);

     free(token->content);
     token->content = (uint8_t *)decrypted;
-    token->length  = (uint32_t)length; /* this may truncate unfortunately, TODO: use 64-bit values internally? */
+    token->length  = length;
     return CL_SUCCESS;
 }

 static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
 {
     uint8_t *decoded, *temp;
-    uint32_t declen = 0, capacity = 0;
+    size_t declen = 0, capacity = 0;

     uint8_t *content = (uint8_t *)token->content;
     uint32_t length  = token->length;
     lzw_stream stream;
     int echg = 1, lzwstat, rc = CL_SUCCESS;

-    if (pdf->ctx && !(pdf->ctx->dconf->other & OTHER_CONF_LZW))
-        return CL_BREAK;
+    if (pdf->ctx && !(pdf->ctx->dconf->other & OTHER_CONF_LZW)) {
+        rc = CL_BREAK;
+        goto done;
+    }

     if (params) {
         struct pdf_dict_node *node = params->nodes;
@@ -924,15 +932,18 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
          * Sample 0015315109, it has \r followed by zlib header.
          * Flag pdf as suspicious, and attempt to extract by skipping the \r.
          */
-        if (!length)
-            return CL_SUCCESS;
+        if (!length) {
+            rc = CL_SUCCESS;
+            goto done;
+        }
     }

     capacity = INFLATE_CHUNK_SIZE;

     if (!(decoded = (uint8_t *)malloc(capacity))) {
         cli_errmsg("cli_pdf: cannot allocate memory for decoded output\n");
-        return CL_EMEM;
+        rc = CL_EMEM;
+        goto done;
     }

     memset(&stream, 0, sizeof(stream));
@@ -947,7 +958,8 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
     if (lzwstat != Z_OK) {
         cli_warnmsg("cli_pdf: lzwInit failed\n");
         free(decoded);
-        return CL_EMEM;
+        rc = CL_EMEM;
+        goto done;
     }

     /* initial inflate */
@@ -962,16 +974,23 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
             length -= q - content;
             content = q;

-            stream.next_in   = (Bytef *)content;
-            stream.avail_in  = length;
-            stream.next_out  = (Bytef *)decoded;
+            stream.next_in  = (Bytef *)content;
+            stream.avail_in = length;
+            stream.next_out = (Bytef *)decoded;
+            /* Make sure we don't overflow during type conversion */
+            if (capacity > UINT_MAX) {
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
+                rc = CL_EFORMAT;
+                goto done;
+            }
             stream.avail_out = capacity;

             lzwstat = lzwInit(&stream);
             if (lzwstat != Z_OK) {
                 cli_warnmsg("cli_pdf: lzwInit failed\n");
                 free(decoded);
-                return CL_EMEM;
+                rc = CL_EMEM;
+                goto done;
             }

             pdfobj_flag(pdf, obj, BAD_FLATESTART);
@@ -984,7 +1003,7 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
         /* extend output capacity if needed,*/
         if (stream.avail_out == 0) {
             if ((rc = cli_checklimits("pdf", pdf->ctx, capacity + INFLATE_CHUNK_SIZE, 0, 0)) != CL_SUCCESS) {
-                cli_dbgmsg("cli_pdf: required buffer size to inflate compressed filter exceeds maximum: %u\n", capacity + INFLATE_CHUNK_SIZE);
+                cli_dbgmsg("cli_pdf: required buffer size to inflate compressed filter exceeds maximum: %zu\n", capacity + INFLATE_CHUNK_SIZE);
                 break;
             }

@@ -996,7 +1015,17 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
             decoded          = temp;
             stream.next_out  = decoded + capacity;
             stream.avail_out = INFLATE_CHUNK_SIZE;
+            if (declen > (SIZE_MAX - INFLATE_CHUNK_SIZE)) {
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
+                rc = CL_EFORMAT;
+                goto done;
+            }
             declen += INFLATE_CHUNK_SIZE;
+            if (capacity > (SIZE_MAX - INFLATE_CHUNK_SIZE)) {
+                cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
+                rc = CL_EFORMAT;
+                goto done;
+            }
             capacity += INFLATE_CHUNK_SIZE;
         }

@@ -1004,6 +1033,12 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,
         lzwstat = lzwInflate(&stream);
     }

+    if (declen > (UINT32_MAX - (INFLATE_CHUNK_SIZE - stream.avail_out))) {
+        cli_dbgmsg("cli_pdf: lzwdecode: overflow detected\n");
+        rc = CL_EFORMAT;
+        goto done;
+    }
+
     /* add stream end fragment to decoded length */
     declen += (INFLATE_CHUNK_SIZE - stream.avail_out);

@@ -1044,6 +1079,7 @@ static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj,

     (void)lzwInflateEnd(&stream);

+done:
     if (rc == CL_SUCCESS) {
         if (declen == 0) {
             cli_dbgmsg("cli_pdf: empty stream after inflation completed.\n");