making webwatcher better hopefully
This commit is contained in:
@@ -93,15 +93,16 @@ def main():
|
|||||||
sys.exit(3)
|
sys.exit(3)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
matches = re.findall(args.pattern, html, flags)
|
# Capture surrounding context so multiple "Beherit" items are treated separately
|
||||||
# If the regex has groups, re.findall returns tuples; normalize to strings
|
context_window = 120 # number of characters around each match to include
|
||||||
norm = []
|
unique = set()
|
||||||
for m in matches:
|
|
||||||
if isinstance(m, tuple):
|
for match in re.finditer(args.pattern, html, flags):
|
||||||
norm.append(" | ".join(m))
|
start = max(match.start() - context_window, 0)
|
||||||
else:
|
end = min(match.end() + context_window, len(html))
|
||||||
norm.append(m)
|
snippet = html[start:end].replace("\n", " ")
|
||||||
unique = set(norm)
|
unique.add(snippet.strip())
|
||||||
|
|
||||||
except re.error as e:
|
except re.error as e:
|
||||||
log_line(log_file, f"ERROR regex '{args.pattern}': {e}")
|
log_line(log_file, f"ERROR regex '{args.pattern}': {e}")
|
||||||
sys.exit(4)
|
sys.exit(4)
|
||||||
|
|||||||
Reference in New Issue
Block a user