Commit 67a02aa23 for clamav.net
commit 67a02aa2352c5628d261cca97244bac6ba9ab461
Author: Val S. <valsnyde@cisco.com>
Date: Wed Jul 1 18:43:46 2026 -0400
Libclamav: tighten HTML filetype signatures (#1734)
Require a tag boundary after the tag name in the built-in <table and <TABLE HTML filetype signatures, so that OOXML elements such as <tableStyles> no longer cause XML content to be retyped as HTML.
Accept as boundaries the same ASCII whitespace bytes that htmlnorm's isspace() handling accepts, and add clamscan coverage for both the <tableStyles> false positive and the accepted <table> tag-boundary forms.
The built-in HTML file type magic (FTM) signatures for iframe, img, object, and script
only matched the start of the tag name. That allowed longer XML or
custom element names, such as <scriptlet> or <objectId>, to retype
otherwise textual content as HTML. This is the same class of false
positive fixed for <table>.
Require an HTML tag boundary after those tag names using the same
byte set used for the table signature: '>' (0x3e), '/' (0x2f), and the
ASCII whitespace bytes accepted by htmlnorm (space 0x20, tab 0x09,
LF 0x0a, VT 0x0b, FF 0x0c, and CR 0x0d). Broaden the clamscan
regression test to cover the adjacent false-positive prefixes and the
accepted real tag boundary forms.
Validation:
- cmake --build clamav/build-asan-clamscan --target clamscan -j12
- VERSION=test SOURCE=/Users/val/dev/clamav-dev-workspace/clamav \
  BUILD=/Users/val/dev/clamav-dev-workspace/clamav/build-asan-clamscan \
  TMP=/private/tmp \
  CLAMSCAN=/Users/val/dev/clamav-dev-workspace/clamav/build-asan-clamscan/clamscan/clamscan \
  python3 -m pytest -q \
  clamscan/hash_and_file_type_test.py::TC::test_html_file_type_tag_signatures_require_tag_boundary
- git -C clamav diff --check
CLAM-3007
diff --git a/libclamav/filetypes_int.h b/libclamav/filetypes_int.h
index 414d9033b..34b4f7406 100644
--- a/libclamav/filetypes_int.h
+++ b/libclamav/filetypes_int.h
@@ -90,23 +90,23 @@ static const char *ftypes_int[] = {
"1:*:3c48544d4c3e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
"1:*:3c486561643e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
"1:*:3c48746d6c3e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c494652414d45:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c494d47:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c496d67:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c4f424a454354:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c4f626a656374:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c534352495054:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c536372697074:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c5441424c45:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c494652414d45(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c494d47(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c496d67(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c4f424a454354(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c4f626a656374(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c534352495054(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c536372697074(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c5441424c45(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
"1:*:3c6120*(68|48)(72|52)4546:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
"1:*:3c6120*(68|48)(72|52)6566:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
"1:*:3c686561643e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
"1:*:3c68746d6c3e:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c696672616d65:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c696d67:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c6f626a656374:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c736372697074:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
- "1:*:3c7461626c65:HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c696672616d65(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c696d67(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c6f626a656374(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c736372697074(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
+ "1:*:3c7461626c65(3e|20|09|0a|0b|0c|0d|2f):HTML data:CL_TYPE_ANY:CL_TYPE_HTML",
"1:*:4d5a{60-300}50450000:PE:CL_TYPE_ANY:CL_TYPE_MSEXE",
"1:*:504b0304:ZIP-SFX:CL_TYPE_ANY:CL_TYPE_ZIPSFX",
"1:*:526172211a0700:RAR-SFX:CL_TYPE_ANY:CL_TYPE_RARSFX",
diff --git a/unit_tests/clamscan/hash_and_file_type_test.py b/unit_tests/clamscan/hash_and_file_type_test.py
index cf2711c86..b016450b0 100644
--- a/unit_tests/clamscan/hash_and_file_type_test.py
+++ b/unit_tests/clamscan/hash_and_file_type_test.py
@@ -269,3 +269,97 @@ class TC(testcase.TestCase):
'logo.png FileType: faketype',
]
self.verify_output(output.out, expected=expected_stdout, unexpected=unexpected_stdout)
+
+
+ def test_html_file_type_tag_signatures_require_tag_boundary(self):
+ self.step_name('Test that HTML file type tag signatures require a tag boundary.')
+
+ (TC.path_tmp / 'good.ldb').write_text(
+ "logo.png.good;Engine:150-255,Target:0;0;fuzzy_img#af2ad01ed42993c7#0\n"
+ )
+
+ def check_file_type(filename, contents, expected_type, unexpected_types=None):
+ if unexpected_types is None:
+ unexpected_types = []
+
+ testfile = TC.path_tmp / filename
+ testfile.write_bytes(contents)
+
+ command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfile} --log-file-type'.format(
+ valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+ path_db=TC.path_tmp / 'good.ldb',
+ testfile=testfile,
+ )
+ output = self.execute_command(command)
+
+ assert output.ec == 0 # clean
+
+ self.verify_output(
+ output.out,
+ expected=[
+ '{}: OK'.format(filename),
+ '{} FileType: {}'.format(filename, expected_type),
+ ],
+ unexpected=[
+ '{} FileType: {}'.format(filename, file_type)
+ for file_type in unexpected_types
+ ]
+ )
+
+ non_html_xml_files = {
+ 'iframe_prefix.xml': b'<?xml version="1.0"?><root><iframeView/></root>',
+ 'iframe_upper_prefix.xml': b'<?xml version="1.0"?><root><IFRAMEVIEW/></root>',
+ 'img_prefix.xml': b'<?xml version="1.0"?><root><imgData/></root>',
+ 'img_mixed_prefix.xml': b'<?xml version="1.0"?><root><ImgData/></root>',
+ 'img_upper_prefix.xml': b'<?xml version="1.0"?><root><IMGDATA/></root>',
+ 'object_prefix.xml': b'<?xml version="1.0"?><root><objectId/></root>',
+ 'object_mixed_prefix.xml': b'<?xml version="1.0"?><root><ObjectId/></root>',
+ 'object_upper_prefix.xml': b'<?xml version="1.0"?><root><OBJECTID/></root>',
+ 'script_prefix.xml': b'<?xml version="1.0"?><root><scriptlet/></root>',
+ 'script_mixed_prefix.xml': b'<?xml version="1.0"?><root><Scriptlet/></root>',
+ 'script_upper_prefix.xml': b'<?xml version="1.0"?><root><SCRIPTLET/></root>',
+ 'table_styles.xml': (
+ b'<?xml version="1.0" encoding="UTF-8"?>'
+ b'<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
+ b'<dxfs count="0"/><tableStyles count="0" defaultTableStyle="TableStyleMedium2"/>'
+ b'</styleSheet>'
+ ),
+ 'table_upper_prefix.xml': b'<?xml version="1.0"?><root><TABLESTYLES/></root>',
+ }
+
+ for filename, contents in non_html_xml_files.items():
+ check_file_type(filename, contents, 'CL_TYPE_TEXT_ASCII', ['CL_TYPE_HTML'])
+
+ html_tag_files = {
+ 'iframe_close.html': b'<iframe></iframe>',
+ 'iframe_space.html': b'<iframe src="sample"></iframe>',
+ 'iframe_slash.html': b'<iframe/>',
+ 'iframe_uppercase.html': b'<IFRAME></IFRAME>',
+ 'img_close.html': b'<img>sample',
+ 'img_space.html': b'<img src="sample">',
+ 'img_slash.html': b'<img/>',
+ 'img_mixedcase.html': b'<Img>sample',
+ 'img_uppercase.html': b'<IMG>sample',
+ 'object_close.html': b'<object></object>',
+ 'object_space.html': b'<object data="sample"></object>',
+ 'object_slash.html': b'<object/>',
+ 'object_mixedcase.html': b'<Object></Object>',
+ 'object_uppercase.html': b'<OBJECT></OBJECT>',
+ 'script_close.html': b'<script></script>',
+ 'script_space.html': b'<script type="text/javascript"></script>',
+ 'script_slash.html': b'<script/>',
+ 'script_mixedcase.html': b'<Script></Script>',
+ 'script_uppercase.html': b'<SCRIPT></SCRIPT>',
+ 'table_close.html': b'<table></table>',
+ 'table_space.html': b'<table class="sample"></table>',
+ 'table_tab.html': b'<table\tclass="sample"></table>',
+ 'table_lf.html': b'<table\nclass="sample"></table>',
+ 'table_vtab.html': b'<table\vclass="sample"></table>',
+ 'table_ff.html': b'<table\fclass="sample"></table>',
+ 'table_cr.html': b'<table\rclass="sample"></table>',
+ 'table_slash.html': b'<table/>',
+ 'table_uppercase.html': b'<TABLE></TABLE>',
+ }
+
+ for filename, contents in html_tag_files.items():
+ check_file_type(filename, contents, 'CL_TYPE_HTML')