web watcher
@@ -1,129 +1,40 @@
 #!/usr/bin/env python3
-import argparse, hashlib, json, os, re, sys, time, subprocess
-from urllib.request import Request, urlopen
-from urllib.error import URLError, HTTPError
-
-
-def notify(summary, body, urgency="critical"):
-    cmd = ["notify-send", "-u", urgency, summary, body]
-    try:
-        subprocess.run(cmd, check=False)
-    except Exception:
-        pass  # don't crash on notification issues
-
-
-def ensure_dirs():
-    state_dir = os.path.expanduser("~/.local/state/webwatcher")
-    os.makedirs(state_dir, exist_ok=True)
-    return state_dir
-
-
-def state_paths(url):
-    state_dir = ensure_dirs()
-    uhash = hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
-    state_file = os.path.join(state_dir, f"{uhash}.json")
-    log_file = os.path.join(state_dir, "webwatcher.log")
-    return state_file, log_file
-
-
-def load_state(path):
-    if os.path.exists(path):
-        try:
-            with open(path, "r") as f:
-                return json.load(f)
-        except Exception:
-            pass
-    return {"seen": []}
-
-
-def save_state(path, state):
-    # keep the seen set bounded
-    if len(state.get("seen", [])) > 2000:
-        state["seen"] = state["seen"][-1000:]
-    tmp = path + ".tmp"
-    with open(tmp, "w") as f:
-        json.dump(state, f)
-    os.replace(tmp, path)
-
-
-def log_line(path, msg):
-    ts = time.strftime("%Y-%m-%d %H:%M:%S")
-    with open(path, "a") as f:
-        f.write(f"[{ts}] {msg}\n")
-
-
-def fetch(url, timeout):
-    req = Request(url, headers={"User-Agent": "webwatcher/1.0"})
-    with urlopen(req, timeout=timeout) as r:
-        return r.read().decode("utf-8", errors="replace")
-
-
-def main():
-    p = argparse.ArgumentParser(
-        description="Scan a webpage for a regex, notify on new matches, and log."
-    )
-    p.add_argument("--url", required=True, help="URL to scan")
-    p.add_argument("--pattern", required=True, help="Regex to search for")
-    p.add_argument("--flags", default="", help="Regex flags: i,m,s (any combo)")
-    p.add_argument("--timeout", type=int, default=15)
-    args = p.parse_args()
-
-    flags = 0
-    if "i" in args.flags.lower():
-        flags |= re.IGNORECASE
-    if "m" in args.flags.lower():
-        flags |= re.MULTILINE
-    if "s" in args.flags.lower():
-        flags |= re.DOTALL
-
-    state_file, log_file = state_paths(args.url)
-    state = load_state(state_file)
-    seen = set(state.get("seen", []))
-
-    try:
-        html = fetch(args.url, args.timeout)
-    except HTTPError as e:
-        log_line(log_file, f"ERROR fetch {args.url}: HTTP {e.code}")
-        sys.exit(1)
-    except URLError as e:
-        log_line(log_file, f"ERROR fetch {args.url}: {e}")
-        sys.exit(2)
-    except Exception as e:
-        log_line(log_file, f"ERROR fetch {args.url}: {e}")
-        sys.exit(3)
-
-    try:
-        # Capture surrounding context so multiple "Beherit" items are treated separately
-        context_window = 120  # number of characters around each match to include
-        unique = set()
-
-        for match in re.finditer(args.pattern, html, flags):
-            start = max(match.start() - context_window, 0)
-            end = min(match.end() + context_window, len(html))
-            snippet = html[start:end].replace("\n", " ")
-            unique.add(snippet.strip())
-
-    except re.error as e:
-        log_line(log_file, f"ERROR regex '{args.pattern}': {e}")
-        sys.exit(4)
-
-    new = [m for m in unique if m not in seen]
-
-    if new:
-        for m in sorted(new):
-            preview = (m[:250] + "…") if len(m) > 250 else m
-            notify(
-                "Web Watcher: new match",
-                preview,
-                urgency="critical",
-            )
-            log_line(log_file, f"NEW match url={args.url} match={preview}")
-            seen.add(m)
-        state["seen"] = list(seen)
-        save_state(state_file, state)
-    else:
-        log_line(log_file, f"OK no new matches url={args.url} (found={len(unique)})")
-
-
-if __name__ == "__main__":
-    main()
+import os, re, json, subprocess
+from urllib.request import urlopen, Request
+
+URL = "https://shop.nwnprod.com"  # page to watch
+STATE = os.path.expanduser("~/.local/state/webwatch_beherit.json")
+USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) " "Gecko/20100101 Firefox/128.0"
+
+# --- fetch html ---
+req = Request(URL, headers={"User-Agent": USER_AGENT})
+html = urlopen(req, timeout=15).read().decode("utf-8", "ignore")
+
+# --- find product IDs and titles that contain 'Beherit' ---
+pattern = re.compile(
+    r"product_id=(\d+)[^>]*>([^<]*Beherit[^<]*)</a>", re.IGNORECASE | re.DOTALL
+)
+products = {
+    pid: re.sub(r"\s+", " ", title).strip() for pid, title in pattern.findall(html)
+}
+
+# --- load previous seen IDs ---
+seen = set()
+if os.path.exists(STATE):
+    with open(STATE) as f:
+        seen = set(json.load(f))
+
+# --- notify for new Beherit items ---
+new = [(pid, title) for pid, title in products.items() if pid not in seen]
+for pid, title in new:
+    subprocess.run(
+        ["notify-send", "-u", "critical", "-t", "0", "Beherit Alert", title],
+        check=False,
+    )
+    print(f"New Beherit item: {title}")
+
+# --- update state file ---
+if new:
+    with open(STATE, "w") as f:
+        json.dump(sorted(products.keys()), f)
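For reference, the extraction step in the new script keys on anchor tags whose URL carries a product_id query parameter and whose link text contains "Beherit". A minimal sketch of what that regex yields; the sample markup is an assumption about the shop's HTML, not a captured page:

import re

# Hypothetical markup; the live page's layout may differ.
html = '<a href="/index.php?product_id=1337">Beherit - Drawing Down the Moon LP</a>'

pattern = re.compile(
    r"product_id=(\d+)[^>]*>([^<]*Beherit[^<]*)</a>", re.IGNORECASE | re.DOTALL
)
products = {pid: re.sub(r"\s+", " ", t).strip() for pid, t in pattern.findall(html)}
print(products)  # {'1337': 'Beherit - Drawing Down the Moon LP'}

One design note: on a hit, the rewrite persists sorted(products.keys()) rather than the union of old and new IDs, so an item that drops off the page and later returns will alert again. If that is unwanted, json.dump(sorted(seen | products.keys()), f) would keep the full history, at the cost of the unbounded state file the removed version explicitly capped.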