#!/usr/bin/env python3
"""Report cross-reference mismatches between EN and NL ISO 27002:2022 files.

The script pairs markdown files from ``EN_FOLDER`` and ``NL_FOLDER`` by the
section number embedded in their file names, extracts the section references
found in each file's text, and writes ``reference_mismatch_report.md`` listing
every section whose two reference sets differ.
"""
import os
import re
from datetime import datetime
from pathlib import Path


def version_parse(v):
    """Convert a dotted section number such as ``"5.10"`` to a tuple of ints.

    The tuple is used as a sort key so that ``"5.10"`` sorts after ``"5.2"``.
    Empty components (a trailing or doubled dot, which the ``[\\d\\.]+``
    capture groups can pick up) are skipped instead of raising ``ValueError``.
    """
    return tuple(int(part) for part in v.split(".") if part)


# Configuration
EN_FOLDER = "ISO27002-EN-2022"
NL_FOLDER = "ISO27002-NL-2022"
EN_PATTERN = re.compile(r"ISO_27002_2022_([\d\.]+)_OT.*\.md")
NL_PATTERN = re.compile(r"ISO_27002_2022_NL_([\d\.]+)_BT.*\.md")

# Reference patterns
EN_REF_PATTERN = re.compile(r"\(see ([\d\.]+)\)|in ([\d\.]+)\)")
NL_REF_PATTERN = re.compile(r"\(zie \[?([\d\.]+)\]?\(?[^)]*\)?\)|in ([\d\.]+)\)")


def extract_references(file_path, pattern):
    """Extract all reference numbers from a file.

    Args:
        file_path: Path of the UTF-8 text file to scan.
        pattern: Compiled regex whose capture groups hold the reference
            number; the first non-empty group of each match is used.

    Returns:
        The distinct references, sorted numerically by ``version_parse``.
        If the file cannot be read, the error is printed and an empty list
        is returned (best-effort, so one bad file does not abort the run).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {file_path}: {e}")
        return []

    references = set()
    for match in pattern.finditer(content):
        # The patterns are alternations, so only one group is filled per match.
        ref = next((group for group in match.groups() if group), None)
        if ref:
            references.add(ref)
    return sorted(references, key=version_parse)


def _index_files(folder, pattern):
    """Map section number -> file path for the ``.md`` files in *folder*."""
    indexed = {}
    # Sorted so that the result is deterministic if two files share a section.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".md"):
            continue
        match = pattern.match(filename)
        if match:
            indexed[match.group(1)] = os.path.join(folder, filename)
    return indexed


def _render_report(en_total, nl_total, matched, mismatches, generated):
    """Build the markdown report text from the collected comparison results."""
    missing = [item for item in mismatches if item.get("missing_nl")]
    differing = len(mismatches) - len(missing)

    lines = [
        "# ISO 27002:2022 Reference Mismatch Report",
        "",
        f"**Generated:** {generated}",
        "",
        "## Summary",
        f"- Total EN files: {en_total}",
        f"- Total NL files: {nl_total}",
        f"- Matched file pairs: {matched}",
        f"- Files with mismatched references: {differing}",
    ]
    if missing:
        lines.append(f"- EN files without NL counterpart: {len(missing)}")
    lines += ["", "---", ""]

    for item in mismatches:
        lines.append(f"## Section {item['section']}")
        lines.append("")
        lines.append(f"- **EN file**: `{item['en_file']}`")
        if item.get("missing_nl"):
            # No NL file to compare against, so there are no reference
            # lists to tabulate for this section.
            lines.append("- **NL file**: *missing*")
            lines += ["", "---", ""]
            continue
        lines.append(f"- **NL file**: `{item['nl_file']}`")
        lines.append("")
        lines.append("| Language | References |")
        lines.append("|----------|------------|")
        lines.append(
            f"| English | {', '.join(item['en_refs']) if item['en_refs'] else '*None*'} |"
        )
        lines.append(
            f"| Dutch | {', '.join(item['nl_refs']) if item['nl_refs'] else '*None*'} |"
        )
        lines.append("")
        if item["only_en"]:
            lines.append(f"✅ **Only in EN**: {', '.join(item['only_en'])}")
        if item["only_nl"]:
            lines.append(f"❌ **Only in NL**: {', '.join(item['only_nl'])}")
        lines += ["", "---", ""]

    return "\n".join(lines)


def main():
    """Compare EN and NL references per section and write the markdown report.

    Reads ``EN_FOLDER`` and ``NL_FOLDER`` relative to the current working
    directory and writes ``reference_mismatch_report.md`` there.
    """
    # Index files by section number
    en_files = _index_files(EN_FOLDER, EN_PATTERN)
    nl_files = _index_files(NL_FOLDER, NL_PATTERN)

    mismatches = []
    matched = 0

    # Compare each matching pair
    for section, en_path in en_files.items():
        nl_path = nl_files.get(section)
        if nl_path is None:
            mismatches.append(
                {
                    "section": section,
                    "missing_nl": True,
                    "en_file": Path(en_path).name,
                }
            )
            continue

        matched += 1
        en_refs = extract_references(en_path, EN_REF_PATTERN)
        nl_refs = extract_references(nl_path, NL_REF_PATTERN)
        en_set, nl_set = set(en_refs), set(nl_refs)

        if en_set != nl_set:
            mismatches.append(
                {
                    "section": section,
                    "en_file": Path(en_path).name,
                    "nl_file": Path(nl_path).name,
                    "en_refs": en_refs,
                    "nl_refs": nl_refs,
                    "only_en": sorted(en_set - nl_set, key=version_parse),
                    "only_nl": sorted(nl_set - en_set, key=version_parse),
                }
            )

    # Sort mismatches by section number properly (numeric, not lexical)
    mismatches.sort(key=lambda x: version_parse(x["section"]))

    # Local time with UTC offset, second precision; computed in-process so
    # the script does not depend on an external `date` command.
    generated = datetime.now().astimezone().isoformat(timespec="seconds")
    report = _render_report(
        len(en_files), len(nl_files), matched, mismatches, generated
    )

    # Write markdown file
    with open("reference_mismatch_report.md", "w", encoding="utf-8") as f:
        f.write(report)

    missing_count = sum(1 for item in mismatches if item.get("missing_nl"))
    print("Report written to reference_mismatch_report.md")
    print(
        f"Found {len(mismatches) - missing_count} mismatched files, "
        "sorted by section number"
    )
    if missing_count:
        print(f"{missing_count} EN files have no NL counterpart")


if __name__ == "__main__":
    main()