iso27diy-corp/Corpus/Standards/ISO-27002-OST/check_references.py
2026-04-21 15:16:17 +02:00

137 lines
4.7 KiB
Python
Executable file

#!/usr/bin/env python3
import os
import re
from pathlib import Path
def version_parse(v):
    """Convert a dotted section number such as ``"5.10"`` into a tuple of ints.

    The tuple form compares numerically component by component, so it can be
    used as a sort key that places "5.9" before "5.10".

    Empty components, produced by a leading, trailing or doubled dot (for
    example ``"5.1."``), are skipped instead of being passed to ``int()``,
    which would raise ``ValueError``.

    Args:
        v: Dotted numeric string.

    Returns:
        A tuple with one int per non-empty component; ``()`` if there is none.

    Raises:
        ValueError: If a non-empty component is not an integer literal.
    """
    return tuple(int(part) for part in v.split(".") if part)
# Configuration
# Folder names are relative paths, so they are resolved against the current
# working directory of the process.
EN_FOLDER = "ISO27002-EN-2022"
NL_FOLDER = "ISO27002-NL-2022"
# File-name patterns. Group 1 captures the section number as a run of digits
# and dots; it is used as the key that pairs an EN file with its NL file.
EN_PATTERN = re.compile(r"ISO_27002_2022_([\d\.]+)_OT.*\.md")
NL_PATTERN = re.compile(r"ISO_27002_2022_NL_([\d\.]+)_BT.*\.md")
# Reference patterns
# Each pattern has two alternatives with one capture group apiece, so every
# match fills exactly one of the two groups:
#   group 1 - "(see N)" in English / "(zie N)" in Dutch; the Dutch form also
#             accepts a Markdown link such as "(zie [N](target))"
#   group 2 - text of the form "in N)"
# N is a run of digits and dots, so it may include stray dots (e.g. "5.1.").
EN_REF_PATTERN = re.compile(r"\(see ([\d\.]+)\)|in ([\d\.]+)\)")
NL_REF_PATTERN = re.compile(r"\(zie \[?([\d\.]+)\]?\(?[^)]*\)?\)|in ([\d\.]+)\)")
def extract_references(file_path, pattern):
    """Extract all reference numbers from a file, sorted numerically.

    Args:
        file_path: Path of the UTF-8 text file to scan.
        pattern: Compiled regex with two capture groups. For every match the
            first group is used when it is non-empty, otherwise the second.

    Returns:
        A list of unique reference strings ordered with ``version_parse``.
        If the file cannot be read or decoded, the error is printed and an
        empty list is returned, so one bad file does not stop the run.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        # Only I/O and decoding problems are treated as "file unreadable";
        # anything else is a programming error and should surface.
        print(f"Error reading {file_path}: {e}")
        return []

    references = set()
    for match in pattern.findall(content):
        # Either group 1 or group 2 will have the value
        ref = match[0] if match[0] else match[1]
        # The capture class accepts dots as well as digits, so a token can
        # carry stray dots ("5.1.", "5..1") or consist of dots only. Dropping
        # the empty components keeps "5.1." and "5.1" from being counted as
        # two different references and keeps the numeric sort from failing.
        ref = ".".join(part for part in ref.split(".") if part)
        if ref:
            references.add(ref)
    return sorted(references, key=version_parse)
def _index_sections(folder, pattern):
    """Map section number -> file path for the ``.md`` files in *folder* whose name matches *pattern*."""
    indexed = {}
    # Sorted so that the result does not depend on the directory listing order.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".md"):
            continue
        match = pattern.match(filename)
        if match:
            indexed[match.group(1)] = os.path.join(folder, filename)
    return indexed


def _render_section(item):
    """Return the Markdown report lines for one mismatch entry."""
    lines = [
        f"## Section {item['section']}",
        "",
        f"- **EN file**: `{item['en_file']}`",
    ]
    if item.get("missing_nl"):
        # There is no NL counterpart, so there are no references to compare.
        lines.append("- **NL file**: *missing*")
        lines.extend(["", "---", ""])
        return lines

    lines.append(f"- **NL file**: `{item['nl_file']}`")
    lines.append("")
    lines.append("| Language | References |")
    lines.append("|----------|------------|")
    lines.append(
        f"| English | {', '.join(item['en_refs']) if item['en_refs'] else '*None*'} |"
    )
    lines.append(
        f"| Dutch | {', '.join(item['nl_refs']) if item['nl_refs'] else '*None*'} |"
    )
    lines.append("")
    if item["only_en"]:
        lines.append(f"✅ **Only in EN**: {', '.join(item['only_en'])}")
    if item["only_nl"]:
        lines.append(f"❌ **Only in NL**: {', '.join(item['only_nl'])}")
    lines.extend(["", "---", ""])
    return lines


def main():
    """Compare the references in paired EN/NL section files and write a report.

    Files in ``EN_FOLDER`` and ``NL_FOLDER`` are paired by the section number
    in their file names. For each pair the reference numbers found in both
    files are compared; pairs whose reference sets differ, and EN sections
    that have no NL file, are written to ``reference_mismatch_report.md`` in
    the current working directory, ordered by section number.
    """
    # Imported here so the block does not depend on the module's import list.
    from datetime import datetime

    # Index files by section number
    en_files = _index_sections(EN_FOLDER, EN_PATTERN)
    nl_files = _index_sections(NL_FOLDER, NL_PATTERN)

    mismatches = []
    matched = 0
    # Compare each matching pair
    for section, en_path in en_files.items():
        nl_path = nl_files.get(section)
        if nl_path is None:
            # Keep the EN file name so the report can render this entry;
            # the renderer reads 'en_file' for every entry.
            mismatches.append(
                {
                    "section": section,
                    "en_file": Path(en_path).name,
                    "missing_nl": True,
                }
            )
            continue
        matched += 1
        en_refs = extract_references(en_path, EN_REF_PATTERN)
        nl_refs = extract_references(nl_path, NL_REF_PATTERN)
        en_set = set(en_refs)
        nl_set = set(nl_refs)
        if en_set != nl_set:
            mismatches.append(
                {
                    "section": section,
                    "en_file": Path(en_path).name,
                    "nl_file": Path(nl_path).name,
                    "en_refs": en_refs,
                    "nl_refs": nl_refs,
                    "only_en": sorted(en_set - nl_set, key=version_parse),
                    "only_nl": sorted(nl_set - en_set, key=version_parse),
                }
            )

    # Sort mismatches by section number properly
    mismatches.sort(key=lambda x: version_parse(x["section"]))

    # Local time with UTC offset, e.g. 2026-04-21T15:16:17+02:00. Produced
    # in-process rather than by running the external `date` command.
    generated = datetime.now().astimezone().isoformat(timespec="seconds")

    # Generate Markdown report
    report_content = [
        "# ISO 27002:2022 Reference Mismatch Report",
        "",
        f"**Generated:** {generated}",
        "",
        "## Summary",
        f"- Total EN files: {len(en_files)}",
        f"- Total NL files: {len(nl_files)}",
        f"- Matched file pairs: {matched}",
        f"- Files with mismatched references: {len(mismatches)}",
        "",
        "---",
        "",
    ]
    for item in mismatches:
        report_content.extend(_render_section(item))

    # Write markdown file
    with open("reference_mismatch_report.md", "w", encoding="utf-8") as f:
        f.write("\n".join(report_content))
    print("Report written to reference_mismatch_report.md")
    print(f"Found {len(mismatches)} mismatched files, sorted by section number")
# Run the comparison only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()