small fixes
This commit is contained in:
parent
a8b1c5d3e6
commit
8d7bcf8c46
8 changed files with 641 additions and 9 deletions
137
Corpus/Standards/ISO-27002-OST/check_references.py
Executable file
137
Corpus/Standards/ISO-27002-OST/check_references.py
Executable file
|
|
@ -0,0 +1,137 @@
|
|||
#!/usr/bin/env python3
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def version_parse(v):
    """Convert a dotted section number into a tuple of ints for numeric sorting.

    Comparing the resulting tuples orders ``"5.10"`` after ``"5.9"``, which a
    plain string comparison would get wrong.

    Args:
        v: Dotted number string such as ``"5.1"`` or ``"8.12.3"``.

    Returns:
        Tuple of the integer components, e.g. ``(5, 1)``. Empty components
        produced by a leading, trailing or doubled dot are skipped, so
        ``"5.1."`` parses as ``(5, 1)`` and ``"."`` as ``()``.

    Raises:
        ValueError: If a non-empty component is not an integer.
    """
    # The capture groups used elsewhere in this file are ``[\d\.]+`` and can
    # therefore yield values like "5.1."; int("") would raise on those.
    return tuple(int(part) for part in v.split(".") if part)
|
||||
|
||||
|
||||
# Configuration
# Folders are relative to the current working directory.
EN_FOLDER = "ISO27002-EN-2022"
NL_FOLDER = "ISO27002-NL-2022"

# File-name patterns; group 1 is the dotted section number.
# The number is matched as digits separated by single dots so that a bare "."
# or a trailing dot can never be captured as a section number.
EN_PATTERN = re.compile(r"ISO_27002_2022_(\d+(?:\.\d+)*)_OT.*\.md")
NL_PATTERN = re.compile(r"ISO_27002_2022_NL_(\d+(?:\.\d+)*)_BT.*\.md")

# Reference patterns
# Each pattern has two alternatives, so exactly one of the two groups is
# non-empty per match:
#   group 1 - "(see 5.1)" / "(zie 5.1)"; the Dutch form also accepts a
#             Markdown link such as "(zie [5.1](target))"
#   group 2 - "... in 5.1)"
# A sentence-ending dot right after the number is tolerated but kept outside
# the capture group, so "5.1." and "5.1" yield the same reference.
EN_REF_PATTERN = re.compile(r"\(see (\d+(?:\.\d+)*)\.?\)|in (\d+(?:\.\d+)*)\.?\)")
NL_REF_PATTERN = re.compile(
    r"\(zie \[?(\d+(?:\.\d+)*)\]?\(?[^)]*\)?\)|in (\d+(?:\.\d+)*)\.?\)"
)
|
||||
|
||||
|
||||
def extract_references(file_path, pattern):
    """Extract all reference numbers from a file.

    Args:
        file_path: Path of the UTF-8 text file to scan.
        pattern: Compiled regex with two capture groups; for every match the
            first non-empty group is taken as the reference number.

    Returns:
        The distinct reference numbers as a list, sorted numerically with
        ``version_parse``. If the file cannot be read or decoded, the error is
        printed and an empty list is returned.
    """
    # Only the file access is best-effort; anything else (e.g. a bad pattern
    # argument) is a programming error and should surface normally.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading {file_path}: {e}")
        return []

    references = set()
    for match in pattern.findall(content):
        # Either group 1 or group 2 will have the value. Stray dots around
        # the number (e.g. a sentence-ending "5.1.") are dropped so the same
        # section compares equal across files and stays parseable for sorting.
        ref = (match[0] or match[1]).strip(".")
        if ref:
            references.add(ref)
    return sorted(references, key=version_parse)
|
||||
|
||||
|
||||
def _index_sections(folder, pattern):
    """Map section number -> file path for each ``.md`` file in *folder* matching *pattern*."""
    indexed = {}
    for filename in os.listdir(folder):
        if not filename.endswith(".md"):
            continue
        match = pattern.match(filename)
        if match:
            indexed[match.group(1)] = os.path.join(folder, filename)
    return indexed


def _render_mismatch(item):
    """Return the Markdown lines describing one entry of the mismatch list."""
    lines = [f"## Section {item['section']}", ""]

    if item.get("missing_nl"):
        # Entries for a missing NL counterpart carry no reference data, so
        # they get a short section of their own instead of the table.
        lines.append(f"- **EN file**: `{item['en_file']}`")
        lines.append("")
        lines.append("❌ **No matching NL file found**")
        lines.append("")
        lines.append("---")
        lines.append("")
        return lines

    lines.append(f"- **EN file**: `{item['en_file']}`")
    lines.append(f"- **NL file**: `{item['nl_file']}`")
    lines.append("")
    lines.append("| Language | References |")
    lines.append("|----------|------------|")
    lines.append(
        f"| English | {', '.join(item['en_refs']) if item['en_refs'] else '*None*'} |"
    )
    lines.append(
        f"| Dutch | {', '.join(item['nl_refs']) if item['nl_refs'] else '*None*'} |"
    )
    lines.append("")
    if item["only_en"]:
        lines.append(f"✅ **Only in EN**: {', '.join(item['only_en'])}")
    if item["only_nl"]:
        lines.append(f"❌ **Only in NL**: {', '.join(item['only_nl'])}")
    lines.append("")
    lines.append("---")
    lines.append("")
    return lines


def main():
    """Compare cross-references between the EN and NL files and write a report.

    Files in ``EN_FOLDER`` and ``NL_FOLDER`` are indexed by the section number
    in their names. For every EN section the set of references found in the EN
    file is compared with the set found in the NL file of the same section.
    Differences, and EN sections that have no NL file, are written to
    ``reference_mismatch_report.md`` in the current working directory, ordered
    by section number. Sections that exist only in the NL folder are counted
    in the summary but not listed individually.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from datetime import datetime

    # Index files by section number
    en_files = _index_sections(EN_FOLDER, EN_PATTERN)
    nl_files = _index_sections(NL_FOLDER, NL_PATTERN)

    mismatches = []
    matched = 0

    # Compare each matching pair
    for section, en_path in en_files.items():
        nl_path = nl_files.get(section)
        if nl_path is None:
            # Record the EN file name as well so the report can show it.
            mismatches.append(
                {
                    "section": section,
                    "missing_nl": True,
                    "en_file": Path(en_path).name,
                }
            )
            continue

        matched += 1
        en_refs = extract_references(en_path, EN_REF_PATTERN)
        nl_refs = extract_references(nl_path, NL_REF_PATTERN)
        en_set = set(en_refs)
        nl_set = set(nl_refs)

        if en_set != nl_set:
            mismatches.append(
                {
                    "section": section,
                    "en_file": Path(en_path).name,
                    "nl_file": Path(nl_path).name,
                    "en_refs": en_refs,
                    "nl_refs": nl_refs,
                    "only_en": sorted(en_set - nl_set, key=version_parse),
                    "only_nl": sorted(nl_set - en_set, key=version_parse),
                }
            )

    # Sort mismatches by section number properly (numeric, not lexicographic)
    mismatches.sort(key=lambda x: version_parse(x["section"]))

    # Local time with UTC offset, second precision - the format that
    # `date -Iseconds` prints, without spawning a shell.
    generated = datetime.now().astimezone().isoformat(timespec="seconds")

    # Generate Markdown report
    report_content = [
        "# ISO 27002:2022 Reference Mismatch Report",
        "",
        f"**Generated:** {generated}",
        "",
        "## Summary",
        f"- Total EN files: {len(en_files)}",
        f"- Total NL files: {len(nl_files)}",
        f"- Matched file pairs: {matched}",
        f"- Files with mismatched references: {len(mismatches)}",
        "",
        "---",
        "",
    ]
    for item in mismatches:
        report_content.extend(_render_mismatch(item))

    # Write markdown file
    with open("reference_mismatch_report.md", "w", encoding="utf-8") as f:
        f.write("\n".join(report_content))

    print("Report written to reference_mismatch_report.md")
    print(f"Found {len(mismatches)} mismatched files, sorted by section number")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue