Improve webwatcher: capture surrounding context for each regex match

This commit is contained in:
2025-10-17 11:47:19 -07:00
parent b39d96f4a0
commit 5e857055e4

View File

@@ -93,15 +93,16 @@ def main():
sys.exit(3)
try:
matches = re.findall(args.pattern, html, flags)
# If the regex has groups, re.findall returns tuples; normalize to strings
norm = []
for m in matches:
if isinstance(m, tuple):
norm.append(" | ".join(m))
else:
norm.append(m)
unique = set(norm)
# Capture surrounding context so multiple "Beherit" items are treated separately
context_window = 120 # number of characters around each match to include
unique = set()
for match in re.finditer(args.pattern, html, flags):
start = max(match.start() - context_window, 0)
end = min(match.end() + context_window, len(html))
snippet = html[start:end].replace("\n", " ")
unique.add(snippet.strip())
except re.error as e:
log_line(log_file, f"ERROR regex '{args.pattern}': {e}")
sys.exit(4)