257 lines
10 KiB
Python
257 lines
10 KiB
Python
# Copyright 2020 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
|
|
|
|
# This file is part of the sos project: https://github.com/sosreport/sos
|
|
#
|
|
# This copyrighted material is made available to anyone wishing to use,
|
|
# modify, copy, or redistribute it subject to the terms and conditions of
|
|
# version 2 of the GNU General Public License.
|
|
#
|
|
# See the LICENSE file in the source distribution for further information.
|
|
|
|
import re
|
|
|
|
from sos.cleaner.mappings import SoSMap
|
|
|
|
|
|
class SoSHostnameMap(SoSMap):
|
|
"""Mapping store for hostnames and domain names
|
|
|
|
Hostnames are obfuscated using an incrementing counter based on the total
|
|
number of hosts matched regardless of domain name.
|
|
|
|
Domain names are obfuscated based on the host's hostname, plus any user
|
|
defined domains passed in by the `--domains` options.
|
|
|
|
Domains are obfuscated as whole units, meaning the domains 'example.com'
|
|
and 'host.foo.example.com' will be separately obfuscated with no relation
|
|
for example as 'obfuscatedomdain1.com' and 'obfuscatedomain2.com'.
|
|
|
|
Top-level domains are left untouched.
|
|
"""
|
|
|
|
ignore_matches = [
|
|
'localhost',
|
|
'.*localdomain.*',
|
|
'^com..*'
|
|
]
|
|
|
|
skip_keys = [
|
|
'www',
|
|
'api'
|
|
]
|
|
|
|
strip_exts = ('.yaml', '.yml', '.crt', '.key', '.pem', '.log', '.repo',
|
|
'.rules', '.conf', '.cfg')
|
|
|
|
ignore_short_items = True
|
|
match_full_words_only = True
|
|
host_count = 0
|
|
domain_count = 0
|
|
_domains = {}
|
|
hosts = {}
|
|
|
|
def load_domains_from_map(self):
|
|
"""Because we use 'intermediary' dicts for host names and domain names
|
|
in this parser, we need to re-inject entries from the map_file into
|
|
these dicts and not just the underlying 'dataset' dict
|
|
"""
|
|
for domain, ob_pair in self.dataset.items():
|
|
if len(domain.split('.')) == 1:
|
|
self.hosts[domain.split('.')[0]] = self.dataset[domain]
|
|
else:
|
|
if ob_pair.startswith('obfuscateddomain'):
|
|
# directly exact domain matches
|
|
self._domains[domain] = ob_pair.split('.')[0]
|
|
continue
|
|
# strip the host name and trailing top-level domain so that
|
|
# we in inject the domain properly for later string matching
|
|
|
|
# note: this is artificially complex due to our stance on
|
|
# preserving TLDs. If in the future the project decides to
|
|
# obfuscate TLDs as well somehow, then this will all become
|
|
# much simpler
|
|
_domain_to_inject = '.'.join(domain.split('.')[1:-1])
|
|
if not _domain_to_inject:
|
|
continue
|
|
for existing_domain, value in self.dataset.items():
|
|
_existing = '.'.join(existing_domain.split('.')[:-1])
|
|
if _existing == _domain_to_inject:
|
|
_ob_domain = '.'.join(value.split('.')[:-1])
|
|
self._domains[_domain_to_inject] = _ob_domain
|
|
self.set_initial_counts()
|
|
|
|
def get_regex_result(self, item):
|
|
"""Override the base get_regex_result() to provide a regex that, if
|
|
this is an FQDN or a straight domain, will include an underscore
|
|
formatted regex as well.
|
|
"""
|
|
if '.' in item:
|
|
item = item.replace('.', '(\\.|_)')
|
|
return super().get_regex_result(item)
|
|
|
|
def set_initial_counts(self):
|
|
"""Set the initial counter for host and domain obfuscation numbers
|
|
based on what is already present in the mapping.
|
|
"""
|
|
# hostnames/short names
|
|
try:
|
|
h = sorted(self.hosts.values(), reverse=True)[0].split('host')[1]
|
|
self.host_count = int(h) + 1
|
|
except IndexError:
|
|
# no hosts loaded yet
|
|
pass
|
|
|
|
# domain names
|
|
try:
|
|
d = sorted(self._domains.values(), reverse=True)[0].split('domain')
|
|
self.domain_count = int(d[1].split('.')[0]) + 1
|
|
except IndexError:
|
|
# no domains loaded yet
|
|
pass
|
|
|
|
def domain_name_in_loaded_domains(self, domain):
|
|
"""Check if a potential domain is in one of the domains we've loaded
|
|
and should be obfuscated
|
|
"""
|
|
if domain in self._domains:
|
|
return True
|
|
host = domain.split('.')
|
|
no_tld = '.'.join(domain.split('.')[0:-1])
|
|
if len(host) == 1:
|
|
# don't block on host's shortname
|
|
return host[0] in self.hosts
|
|
if any(no_tld.endswith(_d) for _d in self._domains):
|
|
return True
|
|
|
|
return False
|
|
|
|
def get(self, item):
|
|
# pylint: disable=too-many-branches
|
|
prefix = ''
|
|
suffix = ''
|
|
final = None
|
|
# The regex pattern match may include a leading and/or trailing '_'
|
|
# character due to the need to use word boundary matching, so we need
|
|
# to strip these from the string during processing, but still keep them
|
|
# in the returned string to not mangle the string replacement in the
|
|
# context of the file or filename
|
|
while item.startswith(('.', '_')):
|
|
prefix += item[0]
|
|
item = item[1:]
|
|
while item.endswith(('.', '_')):
|
|
suffix += item[-1]
|
|
item = item[0:-1]
|
|
if item in self.dataset:
|
|
return self.dataset[item]
|
|
if not self.domain_name_in_loaded_domains(item.lower()):
|
|
# no match => return the original string with optional
|
|
# leading/trailing '.' or '_' characters
|
|
return ''.join([prefix, item, suffix])
|
|
if item.endswith(self.strip_exts):
|
|
ext = '.' + item.split('.')[-1]
|
|
item = item.replace(ext, '')
|
|
suffix += ext
|
|
if item not in self.dataset:
|
|
# try to account for use of '-' in names that include hostnames
|
|
# and don't create new mappings for each of these
|
|
for _existing in sorted(self.dataset.keys(), reverse=True,
|
|
key=len):
|
|
_host_substr = False
|
|
_test = item.split(_existing)
|
|
_h = _existing.split('.')
|
|
# avoid considering a full FQDN match as a new match off of
|
|
# the hostname of an existing match
|
|
if _h[0] and _h[0] in self.hosts:
|
|
_host_substr = True
|
|
if len(_test) == 1 or not _test[0]:
|
|
# does not match existing obfuscation
|
|
continue
|
|
if not _host_substr and (_test[0].endswith('.') or
|
|
item.endswith(_existing)):
|
|
# new hostname in known domain
|
|
final = super().get(item)
|
|
break
|
|
if item.split(_test[0]):
|
|
# string that includes existing FQDN obfuscation substring
|
|
# so, only obfuscate the FQDN part
|
|
try:
|
|
itm = item.split(_test[0])[1]
|
|
final = _test[0] + super().get(itm)
|
|
break
|
|
except Exception:
|
|
# fallback to still obfuscating the entire item
|
|
pass
|
|
|
|
if not final:
|
|
final = super().get(item)
|
|
return prefix + final + suffix
|
|
|
|
def sanitize_item(self, item):
|
|
host = item.split('.')
|
|
if len(host) == 1:
|
|
# we have a shortname for a host
|
|
return self.sanitize_short_name(host[0].lower())
|
|
if len(host) == 2:
|
|
# we have just a domain name, e.g. example.com
|
|
dname = self.sanitize_domain(host)
|
|
if all(h.isupper() for h in host):
|
|
dname = dname.upper()
|
|
return dname
|
|
if len(host) > 2:
|
|
# we have an FQDN, e.g. foo.example.com
|
|
hostname = host[0]
|
|
domain = host[1:]
|
|
# obfuscate the short name
|
|
if len(hostname) > 2:
|
|
ob_hostname = self.sanitize_short_name(hostname.lower())
|
|
else:
|
|
# by best practice it appears the host part of the fqdn was cut
|
|
# off due to some form of truncating, as such don't obfuscate
|
|
# short strings that are likely to throw off obfuscation of
|
|
# unrelated bits and paths
|
|
ob_hostname = 'unknown'
|
|
ob_domain = self.sanitize_domain(domain)
|
|
self.dataset[item] = ob_domain
|
|
_fqdn = '.'.join([ob_hostname, ob_domain])
|
|
if all(h.isupper() for h in host):
|
|
_fqdn = _fqdn.upper()
|
|
return _fqdn
|
|
return None
|
|
|
|
def sanitize_short_name(self, hostname):
|
|
"""Obfuscate the short name of the host with an incremented counter
|
|
based on the total number of obfuscated host names
|
|
"""
|
|
if not hostname or hostname in self.skip_keys:
|
|
return hostname
|
|
if hostname not in self.dataset:
|
|
ob_host = f"host{self.host_count}"
|
|
self.hosts[hostname] = ob_host
|
|
self.host_count += 1
|
|
self.dataset[hostname] = ob_host
|
|
self.add_regex_item(hostname)
|
|
return self.dataset[hostname]
|
|
|
|
def sanitize_domain(self, domain):
|
|
"""Obfuscate the domainname, broken out into subdomains. Top-level
|
|
domains are ignored.
|
|
"""
|
|
for _skip in self.ignore_matches:
|
|
# don't obfuscate vendor domains
|
|
if re.match(_skip, '.'.join(domain)):
|
|
return '.'.join(domain)
|
|
top_domain = domain[-1].lower()
|
|
dname = '.'.join(domain[0:-1]).lower()
|
|
ob_domain = self._new_obfuscated_domain(dname)
|
|
ob_domain = '.'.join([ob_domain, top_domain])
|
|
self.dataset['.'.join(domain)] = ob_domain
|
|
return ob_domain
|
|
|
|
def _new_obfuscated_domain(self, dname):
|
|
"""Generate an obfuscated domain for each subdomain name given
|
|
"""
|
|
if dname not in self._domains:
|
|
self._domains[dname] = f"obfuscateddomain{self.domain_count}"
|
|
self.domain_count += 1
|
|
return self._domains[dname]
|