#!/usr/bin/env python3
"""Collect sanitizer findings from a JUnit-style XML test report into a CSV file.

For every ``<testcase>`` in the report, the captured ``<system-out>`` text is
scanned for UndefinedBehaviorSanitizer ("runtime error:") and MemorySanitizer
blocks, each terminated by a ``SUMMARY:`` line.  Findings are de-duplicated by
source location and written to a CSV file, together with the command lines of
the executables that were found in the same captured output.
"""

# NOTE(review): the patch this script was taken from also changed
# IVAS_CODEC_CI_REF in .gitlab-ci.yml from "main" to
# "kiene/ubsan-error-reporting"; that CI change is not part of this module.

import argparse
import logging
import os
import re
from enum import Enum
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
from xml.etree import ElementTree

import pandas as pd


logging.basicConfig(level=logging.INFO)


# Executables whose command lines are looked up in the captured output.
# The order matters: it is the column order in the CSV and the order in which
# a line is tested against the names (first match wins).
EXECUTABLES = (
    "IVAS_cod",
    "networkSimulator_g192",
    "eid-xor",
    "IVAS_dec",
    "IVAS_rend",
    "ISAR_post_rend",
)

# Column layout of the CSV; fixed so that a report without findings still
# yields a file with a header row.
CSV_COLUMNS = ["testcase", "sanitizer", "location", "type", "traceback", *EXECUTABLES]

# Absolute paths with these suffixes are reduced to the bare file name.
FILENAME_ONLY_SUFFIXES = (".192", ".netsimtrace")

# First line of a UBSan report, e.g. "lib_dec/foo.c:10:5: runtime error: ...".
PATTERN_USAN = re.compile(r"(lib_.+|apps)\/(.*\.[ch]):(\d+):(\d+): runtime error:")
# First line of an MSan report (the SUMMARY line contains the same text and
# is excluded separately).
PATTERN_MSAN = re.compile(r" MemorySanitizer: ")


class ParserState(Enum):
    """Whether the line scanner is currently outside or inside an error block."""

    OUT = 0
    IN = 1


def _strip_cwd(path_str: str, cwd: Path) -> str:
    """Return *path_str* relative to *cwd* if it is an absolute path below it.

    Relative paths and absolute paths outside of *cwd* are returned unchanged
    instead of raising, so that one unexpected path does not abort parsing.
    """
    path = Path(path_str)
    if not path.is_absolute():
        return path_str
    try:
        return str(path.relative_to(cwd))
    except ValueError:
        logging.debug("Path %s is not below %s, keeping it as is.", path_str, cwd)
        return path_str


class SanitizerError:
    """One sanitizer finding extracted from the output of a testcase.

    Two instances compare equal (and hash equal) when they refer to the same
    source location, regardless of testcase or sanitizer kind.  Subclasses set
    ``SUMMARY_ID`` to the sanitizer name used in the ``SUMMARY:`` line.
    """

    SUMMARY_ID = ""

    def __init__(
        self,
        traceback: str,
        commandlines: Dict[str, str],
        testcase: str,
        cwd: Path = Path("."),
    ) -> None:
        """Store the finding and derive its type and location from *traceback*.

        Args:
            traceback: Lines of the report joined by newlines; the last line
                must be the sanitizer's ``SUMMARY:`` line.
            commandlines: Mapping of executable name to the command line found
                for it ("" if none was found).
            testcase: Name of the testcase the output belongs to.
            cwd: Directory prefix to strip from an absolute location.

        Raises:
            ValueError: If the last traceback line is not a matching summary.
        """
        self.traceback = traceback
        self.commandlines = commandlines
        self.testcase = testcase
        self.type, self.location = self.parse_type_and_location(traceback, cwd)

    def __hash__(self) -> int:
        return hash(self.location)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, SanitizerError):
            return NotImplemented
        return self.location == other.location

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} at {self.location}>"

    def __lt__(self, other: "SanitizerError") -> bool:
        """Order by location; for equal locations the better documented one is smaller.

        "Better documented" means more command lines were found, i.e. fewer
        entries in ``commandlines`` are empty.  This makes the most
        informative duplicate come first after sorting.
        """
        if not isinstance(other, SanitizerError):
            return NotImplemented
        if self.location != other.location:
            return self.location < other.location
        return self._num_missing_commandlines() < other._num_missing_commandlines()

    def _num_missing_commandlines(self) -> int:
        """Count the executables for which no command line was found."""
        return sum(1 for cmdline in self.commandlines.values() if cmdline == "")

    def to_dict(self) -> dict:
        """Return the finding as a flat dict, one key per CSV column."""
        return {
            "testcase": self.testcase,
            "sanitizer": self.__class__.__name__.replace("Error", "").upper(),
            "location": self.location,
            "type": self.type,
            "traceback": self.traceback,
            **self.commandlines,
        }

    def parse_type_and_location(self, traceback: str, cwd: Path) -> Tuple[str, str]:
        """Extract the error type and ``file:line:col`` location from the summary.

        Args:
            traceback: Report text whose last line is the ``SUMMARY:`` line.
            cwd: Directory prefix to strip from an absolute location.

        Returns:
            Tuple of (error type, location).

        Raises:
            ValueError: If the last line is not a summary of this sanitizer.
        """
        last_line = traceback.split("\n")[-1].strip()
        m = re.match(
            r"SUMMARY: "
            + re.escape(self.SUMMARY_ID)
            + r": ([a-z-]*) (.*\/.*\.[ch]:\d+:\d+) in",
            last_line,
        )
        if m is None:
            raise ValueError(
                f"Cannot parse {self.SUMMARY_ID or 'sanitizer'} summary line: "
                f"{last_line!r}"
            )

        error_type, location = m.groups()
        return error_type, _strip_cwd(location, cwd)


class UsanError(SanitizerError):
    """Finding reported by UndefinedBehaviorSanitizer."""

    SUMMARY_ID = "UndefinedBehaviorSanitizer"


class MsanError(SanitizerError):
    """Finding reported by MemorySanitizer."""

    SUMMARY_ID = "MemorySanitizer"


def parse_commandlines_from_sysout(sysout: str, cwd: Path) -> dict:
    """Find the first command line of each known executable in *sysout*.

    Args:
        sysout: Captured output of one testcase.
        cwd: Directory prefix to strip from absolute paths in the command lines.

    Returns:
        Dict with one entry per name in ``EXECUTABLES``; the value is the
        post-processed command line or "" if none was found.
    """
    commandlines = {exe: "" for exe in EXECUTABLES}

    for line in sysout.splitlines():
        # the executable name is repeated in the sanitizer traceback
        # ("... in _start ..."), these lines are no command lines
        if " in _start " in line:
            continue

        stripped = line.strip()
        for exe in commandlines:
            # lines starting with the name are skipped because of eid-xor
            # (there are also lines like this: "eid-xor command:")
            if exe not in line or stripped.startswith(exe):
                continue

            if commandlines[exe] != "":
                logging.debug(
                    "Commandline for %s already found, skip second one.", exe
                )
            else:
                commandlines[exe] = postprocess_cmdline(stripped, cwd)

            # assumption: only one commandline per line
            break

    return commandlines


def postprocess_cmdline(cmdline: str, cwd: Path) -> str:
    """Shorten a command line so that it is readable and location independent.

    - the "quiet" flag (``-q``) is removed
    - bitstream / network trace files and a ``.wav`` file in last position
      (the output file) are reduced to their file name
    - other absolute paths below *cwd* are made relative to it; absolute
      paths outside of *cwd* are kept unchanged

    Args:
        cmdline: Whitespace separated command line.
        cwd: Directory prefix to strip from absolute paths.

    Returns:
        The processed command line, tokens joined by single spaces.
    """
    tokens = cmdline.split()
    last_index = len(tokens) - 1
    processed = []

    # enumerate() instead of list.index(): the same path may occur more than
    # once and index() would always report the first occurrence
    for idx, token in enumerate(tokens):
        if token == "-q":
            continue

        token_path = Path(token)
        if not token_path.is_absolute():
            processed.append(token)
        elif token_path.suffix in FILENAME_ONLY_SUFFIXES or (
            token_path.suffix == ".wav" and idx == last_index
        ):
            processed.append(token_path.name)
        else:
            processed.append(_strip_cwd(token, cwd))

    return " ".join(processed)


def parse_errors_from_sysout(
    sysout: str, testcase_name: str, cwd: Path
) -> List[SanitizerError]:
    """Extract all sanitizer findings from the captured output of a testcase.

    A finding starts at a UBSan "runtime error:" line or at a line containing
    " MemorySanitizer: " and ends at the next line starting with "SUMMARY:".

    Args:
        sysout: Captured output of one testcase.
        testcase_name: Name stored with each finding.
        cwd: Directory prefix to strip from absolute paths.

    Returns:
        The findings in order of appearance (``UsanError`` / ``MsanError``).

    Raises:
        ValueError: If the output does not follow the expected structure
            (nested starts, a summary without a start, an unparsable summary).
    """
    logging.debug(testcase_name)
    commandlines = parse_commandlines_from_sysout(sysout, cwd)
    errors: List[SanitizerError] = []

    state = ParserState.OUT
    accu: List[str] = []
    err_cls: Optional[type] = None
    for raw_line in sysout.splitlines():
        line = raw_line.strip()
        is_summary = line.startswith("SUMMARY:")

        usan_start_found = PATTERN_USAN.search(line) is not None
        # the MSan summary line also contains the start pattern
        msan_start_found = PATTERN_MSAN.search(line) is not None and not is_summary

        if usan_start_found and msan_start_found:
            raise ValueError(
                f"{testcase_name}: line starts both a UBSan and an MSan "
                f"report: {line!r}"
            )

        if usan_start_found or msan_start_found:
            if state is not ParserState.OUT:
                raise ValueError(
                    f"{testcase_name}: new sanitizer report starts before the "
                    f"previous one was closed by a SUMMARY line: {line!r}"
                )
            state = ParserState.IN
            accu = []
            err_cls = UsanError if usan_start_found else MsanError

        if state is ParserState.IN:
            accu.append(line)

        if is_summary:
            if state is not ParserState.IN or err_cls is None:
                raise ValueError(
                    f"{testcase_name}: SUMMARY line without a preceding "
                    f"sanitizer report: {line!r}"
                )
            errors.append(err_cls("\n".join(accu), commandlines, testcase_name, cwd))
            state = ParserState.OUT

    return errors


def deduplicate_errors(errors: Iterable[SanitizerError]) -> List[SanitizerError]:
    """Keep one finding per location, sorted by location.

    Of several findings at the same location, the one that sorts first is
    kept, i.e. the one with the most command lines found.
    """
    first_per_location: Dict[str, SanitizerError] = {}
    for err in sorted(errors):
        first_per_location.setdefault(err.location, err)
    return list(first_per_location.values())


def main(args):
    """Parse the XML report and write the unique findings to a CSV file.

    Args:
        args: Namespace with ``xml_report`` (input path), ``outfile`` (CSV
            path) and ``inject_cwd`` (directory prefix to strip from paths).
    """
    root = ElementTree.parse(args.xml_report).getroot()

    errors = []
    # iter() finds the testcases whether or not the root is wrapped in a
    # <testsuites> element
    for tc in root.iter("testcase"):
        tc_name = tc.attrib["name"]
        for sysout in tc.findall("system-out"):
            # .text is None for an empty <system-out/> element
            errors.extend(
                parse_errors_from_sysout(sysout.text or "", tc_name, args.inject_cwd)
            )

    unique_errors = deduplicate_errors(errors)
    print(f"Found {len(unique_errors)} unique errors")

    df = pd.DataFrame([e.to_dict() for e in unique_errors], columns=CSV_COLUMNS)
    df.to_csv(args.outfile, index=False)


def parse_args(argv=None):
    """Parse the command line arguments (``sys.argv`` if *argv* is None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("xml_report")
    parser.add_argument("outfile")
    parser.add_argument(
        "--inject_cwd",
        help="Use this as cwd when pruning the long paths in the command lines. Debug option for testing.",
        default=Path(os.getcwd()).absolute(),
        type=Path,
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    main(parse_args())