#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

"""
Extract raw ASCII strings from (possibly) binary strings.
Both plain ASCII and UTF-16-LE-encoded (aka. wide) strings are extracted.
The latter is found typically in some Windows PEs.

This is more or less similar to what GNU Binutils strings does.

Does not recognize and extract non-ASCII characters

Some alternative and references:
https://github.com/fireeye/flare-floss (also included)
http://stackoverflow.com/questions/10637055/how-do-i-extract-unicode-character-sequences-from-an-mz-executable-file
http://stackoverflow.com/questions/1324067/how-do-i-get-str-translate-to-work-with-unicode-strings
http://stackoverflow.com/questions/11066400/remove-punctuation-from-unicode-formatted-strings/11066687#11066687
https://github.com/TakahiroHaruyama/openioc_scan/blob/d7e8c5962f77f55f9a5d34dbfd0799f8c57eff7f/openioc_scan.py#L184
"""

import re
import string

# at least four characters are needed to consider some blob as a good string
# this is the same default as GNU strings
MIN_LEN = 4
# The same minimum as bytes, embedded in the quantifier of the bytes regex
# below. Derived from MIN_LEN so that the two cannot drift apart.
MIN_LEN_STR = str(MIN_LEN).encode('ascii')


def strings_from_file(location, buff_size=1024 * 1024, clean=True, min_len=MIN_LEN):
    """
    Yield unicode strings made only of printable ASCII characters found in
    the file at `location`.

    The file is read in chunks of `buff_size` bytes (to limit memory usage)
    and each chunk is passed to strings_from_string() with the `clean` and
    `min_len` arguments.

    NOTE: each chunk is processed on its own, so a string that straddles a
    chunk boundary is seen as two separate pieces.
    """
    with open(location, 'rb') as f:
        # iter() with a sentinel stops on the first empty read, i.e. at EOF.
        for buf in iter(lambda: f.read(buff_size), b''):
            for s in strings_from_string(buf, clean=clean, min_len=min_len):
                s = s.strip()
                if len(s) >= min_len:
                    yield s


# Extracted text is digit, letters, punctuation and white spaces
punctuation = re.escape(b"""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~""")
whitespaces = b' \\t\\n\\r\t\n\r'
printable = b'A-Za-z0-9' + whitespaces + punctuation
null_byte = b'\x00'

_ascii_pattern = (
    # plain ASCII is a sequence of printable of a minimum length
    b'('
    + b'[' + printable + b']'
    + b'{' + MIN_LEN_STR + b',}'
    + b')'
    # or utf-16-le-encoded ASCII is a sequence of ASCII+null byte
    + b'|'
    + b'('
    + b'(?:' + b'[' + printable + b']' + null_byte + b')'
    + b'{' + MIN_LEN_STR + b',}'
    + b')'
)

ascii_strings = re.compile(_ascii_pattern).finditer

replace_literal_line_returns = re.compile(r'[\n\r]+$').sub


def normalize_line_ends(s):
    """
    Return string `s` with its trailing run of CR and LF characters (if any)
    replaced by a single POSIX line return (LF).
    """
    return replace_literal_line_returns('\n', s)


def strings_from_string(binary_string, clean=False, min_len=0):
    """
    Yield strings extracted from a (possibly binary) string `binary_string`.
    The strings are ASCII printable characters only. Each extracted run is
    split on line ends and each stripped line of at least `min_len`
    characters is yielded.

    If `clean` is True, each line also goes through clean_string() which
    filters short and repeated strings.

    Note: we do not keep the offset of where a string was found (e.g.
    match.start). The extraction regex is compiled with MIN_LEN, so runs
    shorter than MIN_LEN are never found, even with a smaller `min_len`.
    """
    for match in ascii_strings(binary_string):
        text = decode(match.group())
        if not text:
            continue
        for line in normalize_line_ends(text).splitlines(False):
            line = line.strip()
            if len(line) < min_len:
                continue
            if clean:
                for cleaned in clean_string(line, min_len=min_len):
                    yield cleaned
            else:
                yield line


def string_from_string(binary_string, clean=False, min_len=0):
    """
    Return a single unicode string made of all the strings extracted from a
    (possibly binary) string `binary_string`, joined with a space. All non
    printable characters are removed.
    """
    return u' '.join(strings_from_string(binary_string, clean, min_len))


def decode(s):
    """
    Return a decoded unicode string from bytes `s` or None if the string
    cannot be decoded. Bytes that contain a null byte are decoded as
    UTF-16-LE; anything else is decoded as ASCII.
    """
    if b'\x00' not in s:
        return s.decode('ascii')
    try:
        return s.decode('utf-16-le')
    except UnicodeDecodeError:
        return None


remove_junk = re.compile(
    '[' + punctuation.decode('utf-8') + whitespaces.decode('utf-8') + ']'
).sub

JUNK = frozenset(string.punctuation + string.digits + string.whitespace)


def clean_string(s, min_len=MIN_LEN, junk=JUNK):
    """
    Yield the stripped string `s` if it passes these validity tests, and
    yield nothing otherwise. The tests ignore punctuation and white spaces:
     * not empty, and at least `min_len` characters long
     * made of at least two distinct characters, ignoring case
     * not made only of characters found in `junk` (by default digits,
       punctuation and white spaces)
    """
    s = s.strip()
    # The tests run on what is left once punctuation and spaces are removed.
    core = remove_junk('', s)
    if not core or len(core) < min_len:
        return
    # ignore character repeats, e.g. we need more than one unique character
    if len(set(core.lower())) <= 1:
        return
    # ignore string made only of digits, spaces or punctuations
    if all(c in junk for c in core):
        return
    yield s


#####################################################################################
# TODO: Strings classification
# Classify strings, detect junk, detect paths, symbols, demangle symbols, unescape
# http://code.activestate.com/recipes/466293-efficient-character-escapes-decoding/?in=user-2382677

# The classification patterns are compiled once at import time rather than on
# every call.
_filename = re.compile(r'^[\w_\-]+\.\w{1,4}$', re.IGNORECASE).match


def is_file(s):
    """
    Return a truthy match if s looks like a file name, None otherwise.
    Example: dsdsd.dll
    """
    return _filename(s)


# A name, then ".so.", then one or more dot-separated numeric versions.
_shared_object = re.compile(
    r'^[\w_\-]+\.so\.[0-9]+(?:\.[0-9]+)*$', re.IGNORECASE
).match


def is_shared_object(s):
    """
    Return a truthy match if s looks like a versioned shared object file,
    None otherwise.
    Example: librt.so.1 or librt.so.1.2.3
    """
    return _shared_object(s)


# TODO: implement me
_posix = re.compile(r'^/[\w_\-].*$', re.IGNORECASE).match


def is_posix_path(s):
    """
    Return a truthy match if s looks like a posix path, None otherwise.
    Example: /usr/lib/librt.so.1 or /usr/lib
    """
    return _posix(s)


# TODO: implement me
# The first segment may be ".." alone or a single character, hence the "*".
_relative = re.compile(r'^(?:([^/]|\.\.)[\w_\-]*/.*$)', re.IGNORECASE).match


def is_relative_path(s):
    """
    Return True if s looks like a relative posix path.
    Example: usr/lib/librt.so.1 or ../usr/lib
    """
    return bool(_relative(s))


# A drive letter followed by ":\" or a UNC "\\" prefix, then any characters
# that are allowed in Windows file names, including more backslashes.
_winpath = re.compile(r'^(?:[a-z]:\\|\\\\)[^<>:"|?*\r\n]*$', re.IGNORECASE).match


def is_win_path(s):
    """
    Return a truthy match if s looks like an absolute win path, None
    otherwise.
    Example: c:\\usr\\lib\\librt.so.1 or \\\\server\\share
    """
    return _winpath(s)


def is_c_source(s):
    """
    Return True if s looks like a C source path.
    Example: this.c
    FIXME: should get actual algo from contenttype.
    """
    return s.endswith(('.c', '.cpp', '.hpp', '.h'))


def is_java_source(s):
    """
    Return True if s looks like a Java source path.
    Example: this.java
    FIXME: should get actual algo from contenttype.
    """
    return s.endswith(('.java', '.jsp', '.aj',))


def is_glibc_ref(s):
    """
    Return True if s looks like a reference to GLIBC as typically found in
    Elfs.
    """
    return '@@GLIBC' in s


_java_ref = re.compile(r'^.*$', re.IGNORECASE).match


def is_java_ref(s):
    """
    Return True if s looks like a reference to a java class or package in a
    class file.

    Not implemented yet: this always returns False.
    """
    # TODO: implement me
    _java_ref(s)
    return False


_win_guid = re.compile(
    r'"\{[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\}"',
    re.IGNORECASE,
).match


def is_win_guid(s):
    """
    Return True if s looks like a windows GUID/APPID/CLSID.

    Not implemented yet: this always returns False.
    """
    # TODO: implement me
    _win_guid(s)
    return False


class BinaryStringsClassifier(object):
    """
    Classify extracted strings as good or bad/junk.
    The types of strings that are recognized include:
    file
    file_path
    junk
    text
    """
    # TODO: Implement me


# TODO: a new approach to more aggressively filter strings
def filter_strings(strs, nglen=4):
    """
    Yield strings from the `strs` iterable, filtering out clusters of short
    strings.

    The strings are processed as ngrams of `nglen` consecutive strings. The
    first string of an ngram is discarded as junk if any of these is true:
     * it is empty
     * every string of the ngram has a length of 5 or less
     * the combined length of the strings of the ngram is nglen * 5 or less
     * less than 1% of its characters are unique

    If there are fewer than `nglen` strings they are all yielded as-is.
    """
    from licensedcode.tokenize import ngrams
    # FIXME: the ngrams function skips things if we have less than ngram_len strings
    strs = list(strs)
    if len(strs) < nglen:
        for s in strs:
            yield s
    else:
        for ngm in ngrams(strs, nglen):
            first = ngm[0]
            junk = (
                # an empty string is junk; testing this first also protects
                # the ratio below from a division by zero
                not first
                or all(len(s) <= 5 for s in ngm)
                or sum(len(s) for s in ngm) <= nglen * 5
                or len(set(first)) / float(len(first)) < 0.01
            )
            if junk:
                continue
            yield first


if __name__ == '__main__':
    # also usable a simple command line script
    import sys
    location = sys.argv[1]
    for s in strings_from_file(location):
        print(s)