Pickle-Pixel · strangedef · Jun 10, 2026
diff --git a/src/applypilot/discovery/jobspy.py b/src/applypilot/discovery/jobspy.py
@@ -15,7 +15,7 @@
 from jobspy import scrape_jobs
 
 from applypilot import config
-from applypilot.database import get_connection, init_db, store_jobs
+from applypilot.database import get_connection, init_db
 
 log = logging.getLogger(__name__)
 
@@ -115,6 +115,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
     return False
 
 
+def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
+    """Return True unless *title* contains an excluded term (case-insensitive)."""
+    if not title or not exclude_titles:
+        return True  # nothing to check, or no filter configured
+    t_lower = title.lower()
+    # Skip blank/non-string terms: "" is a substring of every title and would
+    # reject all jobs; a null config entry would crash on .lower().
+    terms = (ex.lower() for ex in exclude_titles if isinstance(ex, str) and ex.strip())
+    return not any(term in t_lower for term in terms)
+
+
 # -- DB storage (JobSpy DataFrame -> SQLite) ---------------------------------
 
 def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tuple[int, int]:
@@ -129,7 +140,6 @@ def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tup
             continue
 
         title = str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None
-        company = str(row.get("company", "")) if str(row.get("company", "")) != "nan" else None
         location_str = str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None
 
         # Build salary string from min/max
@@ -195,6 +205,7 @@ def _run_one_search(
     accept_locs: list[str],
     reject_locs: list[str],
     glassdoor_map: dict,
+    exclude_titles: list[str],
 ) -> dict:
     """Run a single search query and store results in DB."""
     s = search
@@ -268,11 +279,14 @@ def _run_one_search(
         log.info("[%s] 0 results", label)
         return {"new": 0, "existing": 0, "errors": 0, "filtered": 0, "total": 0, "label": label}
 
-    # Filter by location before storing
+    # Filter by location and title before storing
     before = len(df)
     df = df[df.apply(lambda row: _location_ok(
         str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None,
         accept_locs, reject_locs,
+    ) and _title_ok(
+        str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None,
+        exclude_titles,
     ), axis=1)]
     filtered = before - len(df)
 
@@ -281,7 +295,7 @@ def _run_one_search(
 
     msg = f"[{label}] {before} results -> {new} new, {existing} dupes"
     if filtered:
-        msg += f", {filtered} filtered (location)"
+        msg += f", {filtered} filtered (location/title)"
     log.info(msg)
 
     return {"new": new, "existing": existing, "errors": 0, "filtered": filtered, "total": before, "label": label}
@@ -377,6 +391,7 @@ def _full_crawl(
     defaults = search_cfg.get("defaults", {})
     glassdoor_map = search_cfg.get("glassdoor_location_map", {})
     accept_locs, reject_locs = _load_location_config(search_cfg)
+    exclude_titles = search_cfg.get("exclude_titles", [])
 
     if tiers:
         queries = [q for q in queries if q.get("tier") in tiers]
@@ -411,7 +426,7 @@ def _full_crawl(
         result = _run_one_search(
             s, sites, results_per_site, hours_old,
             proxy_config, defaults, max_retries,
-            accept_locs, reject_locs, glassdoor_map,
+            accept_locs, reject_locs, glassdoor_map, exclude_titles
         )
         completed += 1
         total_new += result["new"]

diff --git a/src/applypilot/discovery/smartextract.py b/src/applypilot/discovery/smartextract.py
@@ -20,17 +20,15 @@
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timezone
-from pathlib import Path
 from urllib.parse import quote_plus
 
-import httpx
 import yaml
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
 
 from applypilot import config
 from applypilot.config import CONFIG_DIR
-from applypilot.database import get_connection, init_db, store_jobs, get_stats
+from applypilot.database import init_db, get_stats
 from applypilot.llm import get_client
 
 log = logging.getLogger(__name__)
@@ -73,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
     return False
 
 
+def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
+    """Return True unless *title* contains an excluded term (case-insensitive)."""
+    if not title or not exclude_titles:
+        return True  # nothing to check, or no filter configured
+    t_lower = title.lower()
+    # Skip blank/non-string terms: "" is a substring of every title and would
+    # reject all jobs; a null config entry would crash on .lower().
+    terms = (ex.lower() for ex in exclude_titles if isinstance(ex, str) and ex.strip())
+    return not any(term in t_lower for term in terms)
+
+
 # -- Site configuration from YAML --------------------------------------------
 
 def load_sites() -> list[dict]:
@@ -92,6 +101,7 @@ def _store_jobs_filtered(
     strategy: str,
     accept_locs: list[str],
     reject_locs: list[str],
+    exclude_titles: list[str],
 ) -> tuple[int, int]:
     """Store jobs with location filtering. Returns (new, existing)."""
     now = datetime.now(timezone.utc).isoformat()
@@ -106,6 +116,9 @@ def _store_jobs_filtered(
         if not _location_ok(job.get("location"), accept_locs, reject_locs):
             filtered += 1
             continue
+        if not _title_ok(job.get("title"), exclude_titles):
+            filtered += 1
+            continue
         try:
             conn.execute(
                 "INSERT INTO jobs (url, title, salary, description, location, site, strategy, discovered_at) "
@@ -118,7 +131,7 @@ def _store_jobs_filtered(
             existing += 1
 
     if filtered:
-        log.info("Filtered %d jobs (wrong location)", filtered)
+        log.info("Filtered %d jobs (wrong location or title)", filtered)
     conn.commit()
     return new, existing
 
@@ -424,7 +437,7 @@ def format_strategy_briefing(intel: dict) -> str:
             sections.append(f"\nJSON-LD: {len(job_postings)} JobPosting entries found (usable!)")
             sections.append(f"First JobPosting:\n{json.dumps(job_postings[0], indent=2)[:3000]}")
         else:
-            sections.append(f"\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)")
+            sections.append("\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)")
         if other:
             types = [j.get("@type", "?") if isinstance(j, dict) else "?" for j in other]
             sections.append(f"Other JSON-LD types (NOT job data): {types}")
@@ -439,25 +452,25 @@ def format_strategy_briefing(intel: dict) -> str:
             sections.append(f"  Status: {resp['status']} | Size: {resp['size']:,} chars | Type: {resp.get('type', '?')}")
             if "first_item_keys" in resp:
                 sections.append(f"  Item keys: {resp['first_item_keys']}")
-                sections.append(f"  Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:1000]}")
+                sections.append(f"  Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:100]}")
             if "keys" in resp:
                 sections.append(f"  Object keys: {resp['keys']}")
             for k, v in resp.items():
                 if k.startswith("nested_"):
                     arr_name = k.replace("nested_", "")
                     sections.append(f"  .{arr_name}: array of {v['count']} items")
                     sections.append(f"    Item keys: {v['first_item_keys']}")
-                    sections.append(f"    Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:1000]}")
+                    sections.append(f"    Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:100]}")
                     for sk, sv in v.items():
                         if sk.startswith("first_item.") and isinstance(sv, dict):
                             sub_name = sk.replace("first_item.", "")
                             if "count" in sv:
                                 sections.append(f"    .{arr_name}[0].{sub_name}: array of {sv['count']} items")
                                 sections.append(f"      Item keys: {sv['first_item_keys']}")
-                                sections.append(f"      Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:1500]}")
+                                sections.append(f"      Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:150]}")
                             elif "keys" in sv:
                                 sections.append(f"    .{arr_name}[0].{sub_name}: object with keys {sv['keys']}")
-                                sections.append(f"      Sample: {json.dumps(sv.get('sample', {}), indent=2)[:1500]}")
+                                sections.append(f"      Sample: {json.dumps(sv.get('sample', {}), indent=2)[:150]}")
     else:
         sections.append("\nAPI RESPONSES: none intercepted")
 
@@ -1016,6 +1029,7 @@ def _run_all(
     targets: list[dict],
     accept_locs: list[str],
     reject_locs: list[str],
+    exclude_titles: list[str],
     workers: int = 1,
 ) -> dict:
     """Run smart extract on all targets.
@@ -1038,7 +1052,8 @@ def _process_result(r: dict, target: dict) -> None:
         if jobs:
             new, existing = _store_jobs_filtered(conn, jobs, target["name"],
                                                   r.get("strategy", "?"),
-                                                  accept_locs, reject_locs)
+                                                  accept_locs, reject_locs,
+                                                  exclude_titles)
             total_new += new
             total_existing += existing
             log.info("DB: +%d new, %d already existed", new, existing)
@@ -1103,6 +1118,7 @@ def run_smart_extract(
     """
     search_cfg = config.load_search_config()
     accept_locs, reject_locs = _load_location_filter(search_cfg)
+    exclude_titles = search_cfg.get("exclude_titles", [])
 
     targets = build_scrape_targets(sites=sites, search_cfg=search_cfg)
 
@@ -1115,4 +1131,4 @@ def run_smart_extract(
     log.info("Sites: %d searchable, %d static | Total targets: %d (workers=%d)",
              search_sites, static_sites, len(targets), workers)
 
-    return _run_all(targets, accept_locs, reject_locs, workers=workers)
+    return _run_all(targets, accept_locs, reject_locs, exclude_titles, workers=workers)
diff --git a/src/applypilot/discovery/workday.py b/src/applypilot/discovery/workday.py
@@ -71,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
     return False
 
 
+def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
+    """Return True unless *title* contains an excluded term (case-insensitive)."""
+    if not title or not exclude_titles:
+        return True  # nothing to check, or no filter configured
+    t_lower = title.lower()
+    # Skip blank/non-string terms: "" is a substring of every title and would
+    # reject all jobs; a null config entry would crash on .lower().
+    terms = (ex.lower() for ex in exclude_titles if isinstance(ex, str) and ex.strip())
+    return not any(term in t_lower for term in terms)
+
+
 # -- HTML stripper -----------------------------------------------------------
 
 class _HTMLStripper(HTMLParser):
@@ -194,6 +205,7 @@ def search_employer(
     max_results: int = 0,
     accept_locs: list[str] | None = None,
     reject_locs: list[str] | None = None,
+    exclude_titles: list[str] | None = None,
 ) -> list[dict]:
     """Search an employer, paginate through all results, optionally filter by location."""
     log.info("%s: searching \"%s\"...", employer["name"], search_text)
@@ -225,6 +237,9 @@ def search_employer(
                 if not _location_ok(loc, accept_locs, reject_locs):
                     continue
 
+            if exclude_titles and not _title_ok(j.get("title", ""), exclude_titles):
+                continue
+
             all_jobs.append({
                 "title": j.get("title", ""),
                 "location": loc,
@@ -246,7 +261,7 @@ def search_employer(
             break
 
     log.info("%s: %d jobs found%s", employer["name"], len(all_jobs),
-             " (filtered)" if location_filter else "")
+             " (filtered)" if location_filter or exclude_titles else "")
     return all_jobs
 
 
@@ -347,6 +362,7 @@ def _process_one(
     location_filter: bool,
     accept_locs: list[str],
     reject_locs: list[str],
+    exclude_titles: list[str],
 ) -> dict:
     """Search one employer, fetch details, store results."""
     emp = employers[employer_key]
@@ -357,6 +373,7 @@ def _process_one(
             location_filter=location_filter,
             accept_locs=accept_locs,
             reject_locs=reject_locs,
+            exclude_titles=exclude_titles,
         )
     except Exception as e:
         log.error("%s: ERROR searching '%s': %s", emp["name"], search_text, e)
@@ -390,6 +407,7 @@ def scrape_employers(
     max_results: int = 0,
     accept_locs: list[str] | None = None,
     reject_locs: list[str] | None = None,
+    exclude_titles: list[str] | None = None,
     workers: int = 1,
 ) -> dict:
     """Run full scrape: search -> filter -> detail -> store.
@@ -404,6 +422,8 @@ def scrape_employers(
         accept_locs = []
     if reject_locs is None:
         reject_locs = []
+    if exclude_titles is None:
+        exclude_titles = []
 
     # Ensure DB schema
     init_db()
@@ -423,7 +443,7 @@ def scrape_employers(
             futures = {
                 pool.submit(
                     _process_one, key, employers, search_text,
-                    location_filter, accept_locs, reject_locs,
+                    location_filter, accept_locs, reject_locs, exclude_titles
                 ): key
                 for key in valid_keys
             }
@@ -446,7 +466,7 @@ def scrape_employers(
         for key in valid_keys:
             result = _process_one(
                 key, employers, search_text,
-                location_filter, accept_locs, reject_locs,
+                location_filter, accept_locs, reject_locs, exclude_titles
             )
             completed += 1
             total_new += result["new"]
@@ -493,6 +513,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di
     search_cfg = config.load_search_config()
     queries_cfg = search_cfg.get("queries", [])
     accept_locs, reject_locs = _load_location_filter(search_cfg)
+    exclude_titles = search_cfg.get("exclude_titles", [])
 
     # Default to tier 1-2 queries for workday scraping
     max_tier = search_cfg.get("workday_max_tier", 2)
@@ -526,6 +547,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di
             location_filter=location_filter,
             accept_locs=accept_locs,
             reject_locs=reject_locs,
+            exclude_titles=exclude_titles,
             workers=workers,
         )
         grand_new += result["new"]