Commit 67a02aa23 for clamav.net

commit 67a02aa2352c5628d261cca97244bac6ba9ab461
Author: Val S. <valsnyde@cisco.com>
Date:   Wed Jul 1 18:43:46 2026 -0400

    Libclamav: tighten HTML filetype signatures (#1734)

    Require a tag boundary after the built-in <table and <TABLE HTML filetype signatures so that OOXML elements such as <tableStyles> no longer cause XML content to be retyped as HTML.

    Match the ASCII whitespace bytes accepted by htmlnorm's isspace() handling, and add clamscan coverage for both the <tableStyles> false positive and the accepted <table> tag boundaries.

    The built-in HTML FTM signatures for iframe, img, object, and script
    only matched the start of the tag name. That allowed longer XML or
    custom element names, such as <scriptlet> or <objectId>, to retype
    otherwise textual content as HTML. This is the same class of false
    positive fixed for <table>.

    Require an HTML tag boundary after those tag names using the same
    byte set used for the table signature: '>', '/', and the ASCII
    whitespace bytes accepted by htmlnorm (space, tab, LF, VT, FF, CR).
    Broaden the clamscan regression test to cover the adjacent
    false-positive prefixes and the accepted real tag boundary forms.

    Validation:

    - cmake --build clamav/build-asan-clamscan --target clamscan -j12

    - VERSION=test SOURCE=/Users/val/dev/clamav-dev-workspace/clamav
      BUILD=/Users/val/dev/clamav-dev-workspace/clamav/build-asan-clamscan
      TMP=/private/tmp
      CLAMSCAN=/Users/val/dev/clamav-dev-workspace/clamav/build-asan-clamscan/clamscan/clamscan
      python3 -m pytest -q
      clamscan/hash_and_file_type_test.py::TC::test_html_file_type_tag_signatures_require_tag_boundary

    - git -C clamav diff --check

    CLAM-3007

diff --git a/libclamav/filetypes_int.h b/libclamav/filetypes_int.h
index 414d9033b..34b4f7406 100644
--- a/libclamav/filetypes_int.h
+++ b/libclamav/filetypes_int.h
@@ -90,23 +90,23 @@ static const char *ftypes_int[] = {
     "1:*:3c48544d4c3e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
     "1:*:3c486561643e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
     "1:*:3c48746d6c3e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c494652414d45:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c494d47:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c496d67:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c4f424a454354:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c4f626a656374:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c534352495054:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c536372697074:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c5441424c45:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c494652414d45(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c494d47(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c496d67(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c4f424a454354(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c4f626a656374(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c534352495054(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c536372697074(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c5441424c45(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
     "1:*:3c6120*(68|48)(72|52)4546:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
     "1:*:3c6120*(68|48)(72|52)6566:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
     "1:*:3c686561643e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
     "1:*:3c68746d6c3e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c696672616d65:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c696d67:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c6f626a656374:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c736372697074:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
-    "1:*:3c7461626c65:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c696672616d65(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c696d67(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c6f626a656374(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c736372697074(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+    "1:*:3c7461626c65(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
     "1:*:4d5a{60-300}50450000:PE:CL_TYPE_ANY:CL_TYPE_MSEXE",
     "1:*:504b0304:ZIP-SFX:CL_TYPE_ANY:CL_TYPE_ZIPSFX",
     "1:*:526172211a0700:RAR-SFX:CL_TYPE_ANY:CL_TYPE_RARSFX",
diff --git a/unit_tests/clamscan/hash_and_file_type_test.py b/unit_tests/clamscan/hash_and_file_type_test.py
index cf2711c86..b016450b0 100644
--- a/unit_tests/clamscan/hash_and_file_type_test.py
+++ b/unit_tests/clamscan/hash_and_file_type_test.py
@@ -269,3 +269,97 @@ class TC(testcase.TestCase):
             'logo.png FileType: faketype',
         ]
         self.verify_output(output.out, expected=expected_stdout, unexpected=unexpected_stdout)
+
+
+    def test_html_file_type_tag_signatures_require_tag_boundary(self):
+        self.step_name('Test that HTML file type tag signatures require a tag boundary.')
+
+        (TC.path_tmp / 'good.ldb').write_text(
+            "logo.png.good;Engine:150-255,Target:0;0;fuzzy_img#af2ad01ed42993c7#0\n"
+        )
+
+        def check_file_type(filename, contents, expected_type, unexpected_types=None):
+            if unexpected_types is None:
+                unexpected_types = []
+
+            testfile = TC.path_tmp / filename
+            testfile.write_bytes(contents)
+
+            command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfile} --log-file-type'.format(
+                valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+                path_db=TC.path_tmp / 'good.ldb',
+                testfile=testfile,
+            )
+            output = self.execute_command(command)
+
+            assert output.ec == 0  # clean
+
+            self.verify_output(
+                output.out,
+                expected=[
+                    '{}: OK'.format(filename),
+                    '{} FileType: {}'.format(filename, expected_type),
+                ],
+                unexpected=[
+                    '{} FileType: {}'.format(filename, file_type)
+                    for file_type in unexpected_types
+                ]
+            )
+
+        non_html_xml_files = {
+            'iframe_prefix.xml': b'<?xml version="1.0"?><root><iframeView/></root>',
+            'iframe_upper_prefix.xml': b'<?xml version="1.0"?><root><IFRAMEVIEW/></root>',
+            'img_prefix.xml': b'<?xml version="1.0"?><root><imgData/></root>',
+            'img_mixed_prefix.xml': b'<?xml version="1.0"?><root><ImgData/></root>',
+            'img_upper_prefix.xml': b'<?xml version="1.0"?><root><IMGDATA/></root>',
+            'object_prefix.xml': b'<?xml version="1.0"?><root><objectId/></root>',
+            'object_mixed_prefix.xml': b'<?xml version="1.0"?><root><ObjectId/></root>',
+            'object_upper_prefix.xml': b'<?xml version="1.0"?><root><OBJECTID/></root>',
+            'script_prefix.xml': b'<?xml version="1.0"?><root><scriptlet/></root>',
+            'script_mixed_prefix.xml': b'<?xml version="1.0"?><root><Scriptlet/></root>',
+            'script_upper_prefix.xml': b'<?xml version="1.0"?><root><SCRIPTLET/></root>',
+            'table_styles.xml': (
+                b'<?xml version="1.0" encoding="UTF-8"?>'
+                b'<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
+                b'<dxfs count="0"/><tableStyles count="0" defaultTableStyle="TableStyleMedium2"/>'
+                b'</styleSheet>'
+            ),
+            'table_upper_prefix.xml': b'<?xml version="1.0"?><root><TABLESTYLES/></root>',
+        }
+
+        for filename, contents in non_html_xml_files.items():
+            check_file_type(filename, contents, 'CL_TYPE_TEXT_ASCII', ['CL_TYPE_HTML'])
+
+        html_tag_files = {
+            'iframe_close.html': b'<iframe></iframe>',
+            'iframe_space.html': b'<iframe src="sample"></iframe>',
+            'iframe_slash.html': b'<iframe/>',
+            'iframe_uppercase.html': b'<IFRAME></IFRAME>',
+            'img_close.html': b'<img>sample',
+            'img_space.html': b'<img src="sample">',
+            'img_slash.html': b'<img/>',
+            'img_mixedcase.html': b'<Img>sample',
+            'img_uppercase.html': b'<IMG>sample',
+            'object_close.html': b'<object></object>',
+            'object_space.html': b'<object data="sample"></object>',
+            'object_slash.html': b'<object/>',
+            'object_mixedcase.html': b'<Object></Object>',
+            'object_uppercase.html': b'<OBJECT></OBJECT>',
+            'script_close.html': b'<script></script>',
+            'script_space.html': b'<script type="text/javascript"></script>',
+            'script_slash.html': b'<script/>',
+            'script_mixedcase.html': b'<Script></Script>',
+            'script_uppercase.html': b'<SCRIPT></SCRIPT>',
+            'table_close.html': b'<table></table>',
+            'table_space.html': b'<table class="sample"></table>',
+            'table_tab.html': b'<table\tclass="sample"></table>',
+            'table_lf.html': b'<table\nclass="sample"></table>',
+            'table_vtab.html': b'<table\vclass="sample"></table>',
+            'table_ff.html': b'<table\fclass="sample"></table>',
+            'table_cr.html': b'<table\rclass="sample"></table>',
+            'table_slash.html': b'<table/>',
+            'table_uppercase.html': b'<TABLE></TABLE>',
+        }
+
+        for filename, contents in html_tag_files.items():
+            check_file_type(filename, contents, 'CL_TYPE_HTML')