diff --git a/src/applypilot/discovery/jobspy.py b/src/applypilot/discovery/jobspy.py index b5e54ff4..eb67722d 100644 --- a/src/applypilot/discovery/jobspy.py +++ b/src/applypilot/discovery/jobspy.py @@ -15,7 +15,7 @@ from jobspy import scrape_jobs from applypilot import config -from applypilot.database import get_connection, init_db, store_jobs +from applypilot.database import get_connection, init_db log = logging.getLogger(__name__) @@ -115,6 +115,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) -> return False +def _title_ok(title: str | None, exclude_titles: list[str]) -> bool: + """Check if a job title passes the user's negative title filter.""" + if not title or not exclude_titles: + return True + t_lower = title.lower() + for ex in exclude_titles: + if ex.lower() in t_lower: + return False + return True + + # -- DB storage (JobSpy DataFrame -> SQLite) --------------------------------- def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tuple[int, int]: @@ -129,7 +140,6 @@ def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tup continue title = str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None - company = str(row.get("company", "")) if str(row.get("company", "")) != "nan" else None location_str = str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None # Build salary string from min/max @@ -195,6 +205,7 @@ def _run_one_search( accept_locs: list[str], reject_locs: list[str], glassdoor_map: dict, + exclude_titles: list[str], ) -> dict: """Run a single search query and store results in DB.""" s = search @@ -268,11 +279,14 @@ def _run_one_search( log.info("[%s] 0 results", label) return {"new": 0, "existing": 0, "errors": 0, "filtered": 0, "total": 0, "label": label} - # Filter by location before storing + # Filter by location and title before storing before = len(df) df = df[df.apply(lambda row: _location_ok( str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None, accept_locs, reject_locs, + ) and _title_ok( + str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None, + exclude_titles, ), axis=1)] filtered = before - len(df) @@ -281,7 +295,7 @@ def _run_one_search( msg = f"[{label}] {before} results -> {new} new, {existing} dupes" if filtered: - msg += f", {filtered} filtered (location)" + msg += f", {filtered} filtered (location/title)" log.info(msg) return {"new": new, "existing": existing, "errors": 0, "filtered": filtered, "total": before, "label": label} @@ -377,6 +391,7 @@ def _full_crawl( defaults = search_cfg.get("defaults", {}) glassdoor_map = search_cfg.get("glassdoor_location_map", {}) accept_locs, reject_locs = _load_location_config(search_cfg) + exclude_titles = search_cfg.get("exclude_titles", []) if tiers: queries = [q for q in queries if q.get("tier") in tiers] @@ -411,7 +426,7 @@ def _full_crawl( result = _run_one_search( s, sites, results_per_site, hours_old, proxy_config, defaults, max_retries, - accept_locs, reject_locs, glassdoor_map, + accept_locs, reject_locs, glassdoor_map, exclude_titles ) completed += 1 total_new += result["new"] diff --git a/src/applypilot/discovery/smartextract.py b/src/applypilot/discovery/smartextract.py index cf49a9a2..5661592b 100644 --- a/src/applypilot/discovery/smartextract.py +++ b/src/applypilot/discovery/smartextract.py @@ -20,17 +20,15 @@ import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone -from pathlib import Path from urllib.parse import quote_plus -import httpx import yaml from bs4 import BeautifulSoup from playwright.sync_api import sync_playwright from applypilot import config from applypilot.config import CONFIG_DIR -from applypilot.database import get_connection, init_db, store_jobs, get_stats +from applypilot.database import init_db, get_stats from applypilot.llm import get_client log = logging.getLogger(__name__) @@ -73,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) -> return False +def _title_ok(title: str | None, exclude_titles: list[str]) -> bool: + """Check if a job title passes the user's negative title filter.""" + if not title or not exclude_titles: + return True + t_lower = title.lower() + for ex in exclude_titles: + if ex.lower() in t_lower: + return False + return True + + # -- Site configuration from YAML -------------------------------------------- def load_sites() -> list[dict]: @@ -92,6 +101,7 @@ def _store_jobs_filtered( strategy: str, accept_locs: list[str], reject_locs: list[str], + exclude_titles: list[str], ) -> tuple[int, int]: """Store jobs with location filtering. Returns (new, existing).""" now = datetime.now(timezone.utc).isoformat() @@ -106,6 +116,9 @@ def _store_jobs_filtered( if not _location_ok(job.get("location"), accept_locs, reject_locs): filtered += 1 continue + if not _title_ok(job.get("title"), exclude_titles): + filtered += 1 + continue try: conn.execute( "INSERT INTO jobs (url, title, salary, description, location, site, strategy, discovered_at) " @@ -118,7 +131,7 @@ def _store_jobs_filtered( existing += 1 if filtered: - log.info("Filtered %d jobs (wrong location)", filtered) + log.info("Filtered %d jobs (wrong location or title)", filtered) conn.commit() return new, existing @@ -424,7 +437,7 @@ def format_strategy_briefing(intel: dict) -> str: sections.append(f"\nJSON-LD: {len(job_postings)} JobPosting entries found (usable!)") sections.append(f"First JobPosting:\n{json.dumps(job_postings[0], indent=2)[:3000]}") else: - sections.append(f"\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)") + sections.append("\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)") if other: types = [j.get("@type", "?") if isinstance(j, dict) else "?" for j in other] sections.append(f"Other JSON-LD types (NOT job data): {types}") @@ -439,7 +452,7 @@ def format_strategy_briefing(intel: dict) -> str: sections.append(f" Status: {resp['status']} | Size: {resp['size']:,} chars | Type: {resp.get('type', '?')}") if "first_item_keys" in resp: sections.append(f" Item keys: {resp['first_item_keys']}") - sections.append(f" Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:1000]}") + sections.append(f" Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:100]}") if "keys" in resp: sections.append(f" Object keys: {resp['keys']}") for k, v in resp.items(): @@ -447,17 +460,17 @@ def format_strategy_briefing(intel: dict) -> str: arr_name = k.replace("nested_", "") sections.append(f" .{arr_name}: array of {v['count']} items") sections.append(f" Item keys: {v['first_item_keys']}") - sections.append(f" Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:1000]}") + sections.append(f" Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:100]}") for sk, sv in v.items(): if sk.startswith("first_item.") and isinstance(sv, dict): sub_name = sk.replace("first_item.", "") if "count" in sv: sections.append(f" .{arr_name}[0].{sub_name}: array of {sv['count']} items") sections.append(f" Item keys: {sv['first_item_keys']}") - sections.append(f" Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:1500]}") + sections.append(f" Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:150]}") elif "keys" in sv: sections.append(f" .{arr_name}[0].{sub_name}: object with keys {sv['keys']}") - sections.append(f" Sample: {json.dumps(sv.get('sample', {}), indent=2)[:1500]}") + sections.append(f" Sample: {json.dumps(sv.get('sample', {}), indent=2)[:150]}") else: sections.append("\nAPI RESPONSES: none intercepted") @@ -1016,6 +1029,7 @@ def _run_all( targets: list[dict], accept_locs: list[str], reject_locs: list[str], + exclude_titles: list[str], workers: int = 1, ) -> dict: """Run smart extract on all targets. @@ -1038,7 +1052,8 @@ def _process_result(r: dict, target: dict) -> None: if jobs: new, existing = _store_jobs_filtered(conn, jobs, target["name"], r.get("strategy", "?"), - accept_locs, reject_locs) + accept_locs, reject_locs, + exclude_titles) total_new += new total_existing += existing log.info("DB: +%d new, %d already existed", new, existing) @@ -1103,6 +1118,7 @@ def run_smart_extract( """ search_cfg = config.load_search_config() accept_locs, reject_locs = _load_location_filter(search_cfg) + exclude_titles = search_cfg.get("exclude_titles", []) targets = build_scrape_targets(sites=sites, search_cfg=search_cfg) @@ -1115,4 +1131,4 @@ def run_smart_extract( log.info("Sites: %d searchable, %d static | Total targets: %d (workers=%d)", search_sites, static_sites, len(targets), workers) - return _run_all(targets, accept_locs, reject_locs, workers=workers) + return _run_all(targets, accept_locs, reject_locs, exclude_titles, workers=workers) diff --git a/src/applypilot/discovery/workday.py b/src/applypilot/discovery/workday.py index cef69fe4..d3621cca 100644 --- a/src/applypilot/discovery/workday.py +++ b/src/applypilot/discovery/workday.py @@ -71,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) -> return False +def _title_ok(title: str | None, exclude_titles: list[str]) -> bool: + """Check if a job title passes the user's negative title filter.""" + if not title or not exclude_titles: + return True + t_lower = title.lower() + for ex in exclude_titles: + if ex.lower() in t_lower: + return False + return True + + # -- HTML stripper ----------------------------------------------------------- class _HTMLStripper(HTMLParser): @@ -194,6 +205,7 @@ def search_employer( max_results: int = 0, accept_locs: list[str] | None = None, reject_locs: list[str] | None = None, + exclude_titles: list[str] | None = None, ) -> list[dict]: """Search an employer, paginate through all results, optionally filter by location.""" log.info("%s: searching \"%s\"...", employer["name"], search_text) @@ -225,6 +237,9 @@ def search_employer( if not _location_ok(loc, accept_locs, reject_locs): continue + if exclude_titles and not _title_ok(j.get("title", ""), exclude_titles): + continue + all_jobs.append({ "title": j.get("title", ""), "location": loc, @@ -246,7 +261,7 @@ def search_employer( break log.info("%s: %d jobs found%s", employer["name"], len(all_jobs), - " (filtered)" if location_filter else "") + " (filtered)" if location_filter or exclude_titles else "") return all_jobs @@ -347,6 +362,7 @@ def _process_one( location_filter: bool, accept_locs: list[str], reject_locs: list[str], + exclude_titles: list[str], ) -> dict: """Search one employer, fetch details, store results.""" emp = employers[employer_key] @@ -357,6 +373,7 @@ def _process_one( location_filter=location_filter, accept_locs=accept_locs, reject_locs=reject_locs, + exclude_titles=exclude_titles, ) except Exception as e: log.error("%s: ERROR searching '%s': %s", emp["name"], search_text, e) @@ -390,6 +407,7 @@ def scrape_employers( max_results: int = 0, accept_locs: list[str] | None = None, reject_locs: list[str] | None = None, + exclude_titles: list[str] | None = None, workers: int = 1, ) -> dict: """Run full scrape: search -> filter -> detail -> store. @@ -404,6 +422,8 @@ def scrape_employers( accept_locs = [] if reject_locs is None: reject_locs = [] + if exclude_titles is None: + exclude_titles = [] # Ensure DB schema init_db() @@ -423,7 +443,7 @@ def scrape_employers( futures = { pool.submit( _process_one, key, employers, search_text, - location_filter, accept_locs, reject_locs, + location_filter, accept_locs, reject_locs, exclude_titles ): key for key in valid_keys } @@ -446,7 +466,7 @@ def scrape_employers( for key in valid_keys: result = _process_one( key, employers, search_text, - location_filter, accept_locs, reject_locs, + location_filter, accept_locs, reject_locs, exclude_titles ) completed += 1 total_new += result["new"] @@ -493,6 +513,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di search_cfg = config.load_search_config() queries_cfg = search_cfg.get("queries", []) accept_locs, reject_locs = _load_location_filter(search_cfg) + exclude_titles = search_cfg.get("exclude_titles", []) # Default to tier 1-2 queries for workday scraping max_tier = search_cfg.get("workday_max_tier", 2) @@ -526,6 +547,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di location_filter=location_filter, accept_locs=accept_locs, reject_locs=reject_locs, + exclude_titles=exclude_titles, workers=workers, ) grand_new += result["new"]