Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions src/applypilot/discovery/jobspy.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from jobspy import scrape_jobs

from applypilot import config
from applypilot.database import get_connection, init_db, store_jobs
from applypilot.database import get_connection, init_db

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -115,6 +115,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
return False


def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
"""Check if a job title passes the user's negative title filter."""
if not title or not exclude_titles:
return True
t_lower = title.lower()
for ex in exclude_titles:
if ex.lower() in t_lower:
return False
return True


# -- DB storage (JobSpy DataFrame -> SQLite) ---------------------------------

def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tuple[int, int]:
Expand All @@ -129,7 +140,6 @@ def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tup
continue

title = str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None
company = str(row.get("company", "")) if str(row.get("company", "")) != "nan" else None
location_str = str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None

# Build salary string from min/max
Expand Down Expand Up @@ -195,6 +205,7 @@ def _run_one_search(
accept_locs: list[str],
reject_locs: list[str],
glassdoor_map: dict,
exclude_titles: list[str],
) -> dict:
"""Run a single search query and store results in DB."""
s = search
Expand Down Expand Up @@ -268,11 +279,14 @@ def _run_one_search(
log.info("[%s] 0 results", label)
return {"new": 0, "existing": 0, "errors": 0, "filtered": 0, "total": 0, "label": label}

# Filter by location before storing
# Filter by location and title before storing
before = len(df)
df = df[df.apply(lambda row: _location_ok(
str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None,
accept_locs, reject_locs,
) and _title_ok(
str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None,
exclude_titles,
), axis=1)]
filtered = before - len(df)

Expand All @@ -281,7 +295,7 @@ def _run_one_search(

msg = f"[{label}] {before} results -> {new} new, {existing} dupes"
if filtered:
msg += f", {filtered} filtered (location)"
msg += f", {filtered} filtered (location/title)"
log.info(msg)

return {"new": new, "existing": existing, "errors": 0, "filtered": filtered, "total": before, "label": label}
Expand Down Expand Up @@ -377,6 +391,7 @@ def _full_crawl(
defaults = search_cfg.get("defaults", {})
glassdoor_map = search_cfg.get("glassdoor_location_map", {})
accept_locs, reject_locs = _load_location_config(search_cfg)
exclude_titles = search_cfg.get("exclude_titles", [])

if tiers:
queries = [q for q in queries if q.get("tier") in tiers]
Expand Down Expand Up @@ -411,7 +426,7 @@ def _full_crawl(
result = _run_one_search(
s, sites, results_per_site, hours_old,
proxy_config, defaults, max_retries,
accept_locs, reject_locs, glassdoor_map,
accept_locs, reject_locs, glassdoor_map, exclude_titles
)
completed += 1
total_new += result["new"]
Expand Down
38 changes: 27 additions & 11 deletions src/applypilot/discovery/smartextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,15 @@
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote_plus

import httpx
import yaml
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

from applypilot import config
from applypilot.config import CONFIG_DIR
from applypilot.database import get_connection, init_db, store_jobs, get_stats
from applypilot.database import init_db, get_stats
from applypilot.llm import get_client

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -73,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
return False


def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
"""Check if a job title passes the user's negative title filter."""
if not title or not exclude_titles:
return True
t_lower = title.lower()
for ex in exclude_titles:
if ex.lower() in t_lower:
return False
return True


# -- Site configuration from YAML --------------------------------------------

def load_sites() -> list[dict]:
Expand All @@ -92,6 +101,7 @@ def _store_jobs_filtered(
strategy: str,
accept_locs: list[str],
reject_locs: list[str],
exclude_titles: list[str],
) -> tuple[int, int]:
"""Store jobs with location filtering. Returns (new, existing)."""
now = datetime.now(timezone.utc).isoformat()
Expand All @@ -106,6 +116,9 @@ def _store_jobs_filtered(
if not _location_ok(job.get("location"), accept_locs, reject_locs):
filtered += 1
continue
if not _title_ok(job.get("title"), exclude_titles):
filtered += 1
continue
try:
conn.execute(
"INSERT INTO jobs (url, title, salary, description, location, site, strategy, discovered_at) "
Expand All @@ -118,7 +131,7 @@ def _store_jobs_filtered(
existing += 1

if filtered:
log.info("Filtered %d jobs (wrong location)", filtered)
log.info("Filtered %d jobs (wrong location or title)", filtered)
conn.commit()
return new, existing

Expand Down Expand Up @@ -424,7 +437,7 @@ def format_strategy_briefing(intel: dict) -> str:
sections.append(f"\nJSON-LD: {len(job_postings)} JobPosting entries found (usable!)")
sections.append(f"First JobPosting:\n{json.dumps(job_postings[0], indent=2)[:3000]}")
else:
sections.append(f"\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)")
sections.append("\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)")
if other:
types = [j.get("@type", "?") if isinstance(j, dict) else "?" for j in other]
sections.append(f"Other JSON-LD types (NOT job data): {types}")
Expand All @@ -439,25 +452,25 @@ def format_strategy_briefing(intel: dict) -> str:
sections.append(f" Status: {resp['status']} | Size: {resp['size']:,} chars | Type: {resp.get('type', '?')}")
if "first_item_keys" in resp:
sections.append(f" Item keys: {resp['first_item_keys']}")
sections.append(f" Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:1000]}")
sections.append(f" Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:100]}")
if "keys" in resp:
sections.append(f" Object keys: {resp['keys']}")
for k, v in resp.items():
if k.startswith("nested_"):
arr_name = k.replace("nested_", "")
sections.append(f" .{arr_name}: array of {v['count']} items")
sections.append(f" Item keys: {v['first_item_keys']}")
sections.append(f" Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:1000]}")
sections.append(f" Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:100]}")
for sk, sv in v.items():
if sk.startswith("first_item.") and isinstance(sv, dict):
sub_name = sk.replace("first_item.", "")
if "count" in sv:
sections.append(f" .{arr_name}[0].{sub_name}: array of {sv['count']} items")
sections.append(f" Item keys: {sv['first_item_keys']}")
sections.append(f" Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:1500]}")
sections.append(f" Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:150]}")
elif "keys" in sv:
sections.append(f" .{arr_name}[0].{sub_name}: object with keys {sv['keys']}")
sections.append(f" Sample: {json.dumps(sv.get('sample', {}), indent=2)[:1500]}")
sections.append(f" Sample: {json.dumps(sv.get('sample', {}), indent=2)[:150]}")
else:
sections.append("\nAPI RESPONSES: none intercepted")

Expand Down Expand Up @@ -1016,6 +1029,7 @@ def _run_all(
targets: list[dict],
accept_locs: list[str],
reject_locs: list[str],
exclude_titles: list[str],
workers: int = 1,
) -> dict:
"""Run smart extract on all targets.
Expand All @@ -1038,7 +1052,8 @@ def _process_result(r: dict, target: dict) -> None:
if jobs:
new, existing = _store_jobs_filtered(conn, jobs, target["name"],
r.get("strategy", "?"),
accept_locs, reject_locs)
accept_locs, reject_locs,
exclude_titles)
total_new += new
total_existing += existing
log.info("DB: +%d new, %d already existed", new, existing)
Expand Down Expand Up @@ -1103,6 +1118,7 @@ def run_smart_extract(
"""
search_cfg = config.load_search_config()
accept_locs, reject_locs = _load_location_filter(search_cfg)
exclude_titles = search_cfg.get("exclude_titles", [])

targets = build_scrape_targets(sites=sites, search_cfg=search_cfg)

Expand All @@ -1115,4 +1131,4 @@ def run_smart_extract(
log.info("Sites: %d searchable, %d static | Total targets: %d (workers=%d)",
search_sites, static_sites, len(targets), workers)

return _run_all(targets, accept_locs, reject_locs, workers=workers)
return _run_all(targets, accept_locs, reject_locs, exclude_titles, workers=workers)
28 changes: 25 additions & 3 deletions src/applypilot/discovery/workday.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
return False


def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
"""Check if a job title passes the user's negative title filter."""
if not title or not exclude_titles:
return True
t_lower = title.lower()
for ex in exclude_titles:
if ex.lower() in t_lower:
return False
return True


# -- HTML stripper -----------------------------------------------------------

class _HTMLStripper(HTMLParser):
Expand Down Expand Up @@ -194,6 +205,7 @@ def search_employer(
max_results: int = 0,
accept_locs: list[str] | None = None,
reject_locs: list[str] | None = None,
exclude_titles: list[str] | None = None,
) -> list[dict]:
"""Search an employer, paginate through all results, optionally filter by location."""
log.info("%s: searching \"%s\"...", employer["name"], search_text)
Expand Down Expand Up @@ -225,6 +237,9 @@ def search_employer(
if not _location_ok(loc, accept_locs, reject_locs):
continue

if exclude_titles and not _title_ok(j.get("title", ""), exclude_titles):
continue

all_jobs.append({
"title": j.get("title", ""),
"location": loc,
Expand All @@ -246,7 +261,7 @@ def search_employer(
break

log.info("%s: %d jobs found%s", employer["name"], len(all_jobs),
" (filtered)" if location_filter else "")
" (filtered)" if location_filter or exclude_titles else "")
return all_jobs


Expand Down Expand Up @@ -347,6 +362,7 @@ def _process_one(
location_filter: bool,
accept_locs: list[str],
reject_locs: list[str],
exclude_titles: list[str],
) -> dict:
"""Search one employer, fetch details, store results."""
emp = employers[employer_key]
Expand All @@ -357,6 +373,7 @@ def _process_one(
location_filter=location_filter,
accept_locs=accept_locs,
reject_locs=reject_locs,
exclude_titles=exclude_titles,
)
except Exception as e:
log.error("%s: ERROR searching '%s': %s", emp["name"], search_text, e)
Expand Down Expand Up @@ -390,6 +407,7 @@ def scrape_employers(
max_results: int = 0,
accept_locs: list[str] | None = None,
reject_locs: list[str] | None = None,
exclude_titles: list[str] | None = None,
workers: int = 1,
) -> dict:
"""Run full scrape: search -> filter -> detail -> store.
Expand All @@ -404,6 +422,8 @@ def scrape_employers(
accept_locs = []
if reject_locs is None:
reject_locs = []
if exclude_titles is None:
exclude_titles = []

# Ensure DB schema
init_db()
Expand All @@ -423,7 +443,7 @@ def scrape_employers(
futures = {
pool.submit(
_process_one, key, employers, search_text,
location_filter, accept_locs, reject_locs,
location_filter, accept_locs, reject_locs, exclude_titles
): key
for key in valid_keys
}
Expand All @@ -446,7 +466,7 @@ def scrape_employers(
for key in valid_keys:
result = _process_one(
key, employers, search_text,
location_filter, accept_locs, reject_locs,
location_filter, accept_locs, reject_locs, exclude_titles
)
completed += 1
total_new += result["new"]
Expand Down Expand Up @@ -493,6 +513,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di
search_cfg = config.load_search_config()
queries_cfg = search_cfg.get("queries", [])
accept_locs, reject_locs = _load_location_filter(search_cfg)
exclude_titles = search_cfg.get("exclude_titles", [])

# Default to tier 1-2 queries for workday scraping
max_tier = search_cfg.get("workday_max_tier", 2)
Expand Down Expand Up @@ -526,6 +547,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di
location_filter=location_filter,
accept_locs=accept_locs,
reject_locs=reject_locs,
exclude_titles=exclude_titles,
workers=workers,
)
grand_new += result["new"]
Expand Down