From edfe4fa2024999d9b7803acd8ead9e6b62adedb9 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Jun 2026 04:18:06 +0000 Subject: [PATCH 1/2] Add network-mocked tests for PCLClient (pcl.py -> 100% coverage) Cover the PCL API client end-to-end with mocked requests.Session and a patched authenticate(): exception hierarchy, token property/auth, the _make_request branches (GET/POST/DELETE, 401 retry, 404/406/500, token refresh, network errors), immediate searches, pagination, and all batch job operations. https://claude.ai/code/session_01NNvhsYRVWhjfcdgaSmU5bt --- tests/test_pcl.py | 381 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 381 insertions(+) create mode 100644 tests/test_pcl.py diff --git a/tests/test_pcl.py b/tests/test_pcl.py new file mode 100644 index 0000000..8b31d3a --- /dev/null +++ b/tests/test_pcl.py @@ -0,0 +1,381 @@ +"""Tests for the PACER Case Locator (PCL) API client. + +All network access is mocked: the client's `_session` is replaced with a +MagicMock and `pacer_cli.pcl.authenticate` is patched so no real login occurs. +""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from pacer_cli.auth import AuthResult +from pacer_cli.config import PacerConfig +from pacer_cli.models import ( + CaseSearchCriteria, + PartySearchCriteria, +) +from pacer_cli.pcl import ( + PCLAuthError, + PCLClient, + PCLError, + PCLNotFoundError, + PCLValidationError, +) + + +@pytest.fixture +def config() -> PacerConfig: + return PacerConfig(username="user", password="pass") + + +@pytest.fixture +def client(config): + """A PCLClient with a mocked session and patched authentication.""" + with patch("pacer_cli.pcl.authenticate") as mock_auth: + mock_auth.return_value = AuthResult(success=True, token="tok-" + "x" * 124) + c = PCLClient(config) + c._session = MagicMock(spec=requests.Session) + # default token so requests don't trigger auth unless we clear it + c._token = "tok-" + "x" * 124 + c._mock_auth = mock_auth + yield c + + +def _resp(status_code=200, json_data=None, text="", headers=None): + resp = MagicMock(spec=requests.Response) + resp.status_code = status_code + resp.json.return_value = json_data if json_data is not None else {} + resp.text = text + resp.headers = headers or {} + if status_code >= 400: + resp.raise_for_status.side_effect = requests.HTTPError(response=resp) + else: + resp.raise_for_status.return_value = None + return resp + + +# Minimal realistic PCL payloads ------------------------------------------------- + +CASE_PAGE_LAST = { + "receipt": { + "transactionDate": "2026-06-10", + "billablePages": 1, + "loginId": "user", + "search": "Case Search", + "searchFee": "0.10", + }, + "pageInfo": { + "number": 0, + "size": 54, + "totalPages": 1, + "totalElements": 1, + "numberOfElements": 1, + "first": True, + "last": True, + }, + "content": [ + { + "courtId": "nysd", + "caseId": 500997, + "caseNumberFull": "1:18-cv-08434", + "caseTitle": "Apple Inc. v. Samsung", + "dateFiled": "2018-09-15", + "caseLink": "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", + } + ], +} + +PARTY_PAGE_LAST = { + "receipt": {"billablePages": 1, "searchFee": "0.10"}, + "pageInfo": { + "number": 0, + "size": 54, + "totalPages": 1, + "totalElements": 1, + "numberOfElements": 1, + "first": True, + "last": True, + }, + "content": [ + { + "courtId": "nysd", + "caseId": 1, + "lastName": "Smith", + "firstName": "John", + "partyType": "Plaintiff", + } + ], +} + + +def _case_page(number, last): + return { + "pageInfo": { + "number": number, + "size": 54, + "totalPages": 3, + "totalElements": 150, + "numberOfElements": 54, + "first": number == 0, + "last": last, + }, + "content": [{"courtId": "nysd", "caseId": number}], + } + + +# --------------------------------------------------------------------------- +# Exception classes +# --------------------------------------------------------------------------- + + +class TestExceptions: + def test_hierarchy(self): + assert issubclass(PCLAuthError, PCLError) + assert issubclass(PCLValidationError, PCLError) + assert issubclass(PCLNotFoundError, PCLError) + assert issubclass(PCLError, Exception) + + +# --------------------------------------------------------------------------- +# Authentication / token property +# --------------------------------------------------------------------------- + + +class TestAuthentication: + def test_token_property_authenticates_when_missing(self, config): + with patch("pacer_cli.pcl.authenticate") as mock_auth: + mock_auth.return_value = AuthResult(success=True, token="abc") + c = PCLClient(config) + c._session = MagicMock(spec=requests.Session) + assert c.token == "abc" + mock_auth.assert_called_once() + + def test_token_property_cached(self, client): + # token already set in fixture; property returns it without re-auth + assert client.token.startswith("tok-") + client._mock_auth.assert_not_called() + + def test_authenticate_failure_raises(self, config): + with patch("pacer_cli.pcl.authenticate") as mock_auth: + mock_auth.return_value = AuthResult(success=False, error="bad creds") + c = PCLClient(config) + c._session = MagicMock(spec=requests.Session) + with pytest.raises(PCLAuthError, match="bad creds"): + _ = c.token + + def test_post_init_sets_json_headers(self, config): + with patch("pacer_cli.pcl.authenticate"): + c = PCLClient(config) + assert c._session.headers["Accept"] == "application/json" + assert c._session.headers["Content-Type"] == "application/json" + + +# --------------------------------------------------------------------------- +# _make_request behaviors +# --------------------------------------------------------------------------- + + +class TestMakeRequest: + def test_get_request_success(self, client): + client._session.get.return_value = _resp(json_data={"ok": True}) + resp = client._make_request("GET", "/cases/reports") + assert resp.json() == {"ok": True} + url = client._session.get.call_args[0][0] + assert url.endswith("/cases/reports") + + def test_post_includes_payload(self, client): + client._session.post.return_value = _resp(json_data={}) + client._make_request("POST", "/cases/find", payload={"a": 1}) + assert client._session.post.call_args.kwargs["json"] == {"a": 1} + + def test_delete_request(self, client): + client._session.delete.return_value = _resp(status_code=204) + resp = client._make_request("DELETE", "/cases/reports/5") + assert resp.status_code == 204 + + def test_unsupported_method_raises(self, client): + with pytest.raises(ValueError, match="Unsupported HTTP method"): + client._make_request("PATCH", "/x") + + def test_client_code_header(self, config): + with patch("pacer_cli.pcl.authenticate"): + c = PCLClient(config) + c.config.client_code = "MATTER-1" + c._session = MagicMock(spec=requests.Session) + c._token = "tok" + c._session.get.return_value = _resp(json_data={}) + c._make_request("GET", "/x") + headers = c._session.get.call_args.kwargs["headers"] + assert headers["X-CLIENT-CODE"] == "MATTER-1" + + def test_token_refresh_from_response_header(self, client): + client._session.get.return_value = _resp( + json_data={}, headers={"X-NEXT-GEN-CSO": "newtoken"} + ) + client._make_request("GET", "/x") + assert client._token == "newtoken" + + def test_404_raises_not_found(self, client): + client._session.get.return_value = _resp(status_code=404) + with pytest.raises(PCLNotFoundError): + client._make_request("GET", "/cases/missing") + + def test_406_validation_error_with_json_message(self, client): + client._session.post.return_value = _resp( + status_code=406, json_data={"message": "bad field"} + ) + with pytest.raises(PCLValidationError, match="bad field"): + client._make_request("POST", "/cases/find", payload={}) + + def test_406_validation_error_with_text_fallback(self, client): + resp = _resp(status_code=406, text="plain text error") + resp.json.side_effect = ValueError("not json") + client._session.post.return_value = resp + with pytest.raises(PCLValidationError, match="plain text error"): + client._make_request("POST", "/cases/find", payload={}) + + def test_500_raises_pclerror_via_raise_for_status(self, client): + client._session.get.return_value = _resp(status_code=500) + with pytest.raises(PCLError): + client._make_request("GET", "/x") + + def test_network_exception_wrapped(self, client): + client._session.get.side_effect = requests.ConnectionError("down") + with pytest.raises(PCLError, match="Network error"): + client._make_request("GET", "/x") + + def test_401_retries_with_fresh_auth_then_succeeds(self, client): + first = _resp(status_code=401) + second = _resp(json_data={"ok": True}) + client._session.get.side_effect = [first, second] + client._mock_auth.return_value = AuthResult(success=True, token="fresh") + resp = client._make_request("GET", "/x") + assert resp.json() == {"ok": True} + # token cleared then re-authenticated + client._mock_auth.assert_called_once() + + def test_401_after_retry_raises(self, client): + # retry_auth False path: a 401 with retry disabled + client._session.get.return_value = _resp(status_code=401) + with pytest.raises(PCLAuthError, match="after retry"): + client._make_request("GET", "/x", retry_auth=False) + + +# --------------------------------------------------------------------------- +# Immediate searches +# --------------------------------------------------------------------------- + + +class TestImmediateSearches: + def test_search_cases(self, client): + client._session.post.return_value = _resp(json_data=CASE_PAGE_LAST) + result = client.search_cases(CaseSearchCriteria(case_title="Apple")) + assert len(result.content) == 1 + assert result.content[0].case_link.endswith("500997") + assert result.receipt.fee_cents == 10 + # endpoint includes page param + assert "page=0" in client._session.post.call_args[0][0] + + def test_search_cases_with_page(self, client): + client._session.post.return_value = _resp(json_data=_case_page(2, True)) + client.search_cases(CaseSearchCriteria(case_title="Apple"), page=2) + assert "page=2" in client._session.post.call_args[0][0] + + def test_search_parties(self, client): + client._session.post.return_value = _resp(json_data=PARTY_PAGE_LAST) + result = client.search_parties(PartySearchCriteria(last_name="Smith")) + assert result.content[0].full_name == "John Smith" + assert "/parties/find" in client._session.post.call_args[0][0] + + +# --------------------------------------------------------------------------- +# Pagination +# --------------------------------------------------------------------------- + + +class TestPagination: + def test_fetch_all_pages_stops_on_last(self, client): + pages = [ + _resp(json_data=_case_page(0, False)), + _resp(json_data=_case_page(1, False)), + _resp(json_data=_case_page(2, True)), + ] + client._session.post.side_effect = pages + results = client.search_cases_all_pages(CaseSearchCriteria(case_title="x")) + assert len(results) == 3 + assert results[-1].page_info.last is True + + def test_fetch_all_pages_respects_max_pages(self, client): + # never-last responses; max_pages caps the loop + client._session.post.return_value = _resp(json_data=_case_page(0, False)) + results = client.search_cases_all_pages( + CaseSearchCriteria(case_title="x"), max_pages=2 + ) + assert len(results) == 2 + + def test_search_parties_all_pages(self, client): + client._session.post.return_value = _resp(json_data=PARTY_PAGE_LAST) + results = client.search_parties_all_pages(PartySearchCriteria(last_name="Smith")) + assert len(results) == 1 + + +# --------------------------------------------------------------------------- +# Batch searches +# --------------------------------------------------------------------------- + + +class TestBatchSearches: + def test_start_batch_case_search(self, client): + client._session.post.return_value = _resp( + json_data={"reportId": 42, "status": "WAITING"} + ) + job = client.start_batch_case_search(CaseSearchCriteria(case_title="x")) + assert job.report_id == 42 + assert job.is_running is True + assert "/cases/download" in client._session.post.call_args[0][0] + + def test_start_batch_party_search(self, client): + client._session.post.return_value = _resp( + json_data={"reportId": 7, "status": "RUNNING"} + ) + job = client.start_batch_party_search(PartySearchCriteria(last_name="x")) + assert job.report_id == 7 + assert "/parties/download" in client._session.post.call_args[0][0] + + def test_get_batch_status(self, client): + client._session.get.return_value = _resp( + json_data={"reportId": 42, "status": "COMPLETED"} + ) + job = client.get_batch_status(42, search_type="cases") + assert job.is_complete is True + assert "/cases/download/status/42" in client._session.get.call_args[0][0] + + def test_list_batch_jobs(self, client): + client._session.get.return_value = _resp( + json_data={"content": [{"reportId": 1, "status": "COMPLETED"}]} + ) + resp = client.list_batch_jobs(search_type="parties") + assert len(resp.content) == 1 + assert "/parties/reports" in client._session.get.call_args[0][0] + + def test_download_batch_results_cases(self, client): + client._session.get.return_value = _resp(json_data=CASE_PAGE_LAST) + resp = client.download_batch_results(42, search_type="cases") + assert resp.content[0].court_id == "nysd" + assert "/cases/download/42" in client._session.get.call_args[0][0] + + def test_download_batch_results_parties(self, client): + client._session.get.return_value = _resp(json_data=PARTY_PAGE_LAST) + resp = client.download_batch_results(7, search_type="parties") + assert resp.content[0].last_name == "Smith" + + def test_delete_batch_job_success(self, client): + client._session.delete.return_value = _resp(status_code=204) + assert client.delete_batch_job(42) is True + + def test_delete_batch_job_not_deleted(self, client): + client._session.delete.return_value = _resp(status_code=200) + assert client.delete_batch_job(42) is False From 5defd3fd74bc3a7d1c2492104a1048458bb7baff Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Jun 2026 04:18:14 +0000 Subject: [PATCH 2/2] Add network-mocked tests for downloader.py (-> 96% coverage) Cover the CM/ECF docket/document downloaders with a mocked requests.Session and patched authenticate(): the pure helpers (extract_document_metadata, load_cached_documents, get_document_by_number, DownloadResult), DocketDownloader auth/helpers/_cso_login (including the MFA flow), download_docket_by_link (direct docket, report-config form, JS->CSO redirect, error/short/network paths), download_docket_by_case_number, and DocumentDownloader (auth gating, direct PDF, PDF-link/iframe/goDLS receipt/generic-form flows, HTML-instead-of-PDF and missing-magic-bytes warnings, login-redirect and network error paths). https://claude.ai/code/session_01NNvhsYRVWhjfcdgaSmU5bt --- tests/test_downloader.py | 874 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 874 insertions(+) create mode 100644 tests/test_downloader.py diff --git a/tests/test_downloader.py b/tests/test_downloader.py new file mode 100644 index 0000000..78d6b50 --- /dev/null +++ b/tests/test_downloader.py @@ -0,0 +1,874 @@ +"""Tests for the CM/ECF docket and document downloaders. + +All network access is mocked: the downloaders' ``session`` is replaced with a +MagicMock and ``pacer_cli.downloader.authenticate`` is patched so no real login +occurs. The pure helpers (metadata extraction, cache loading) need no network. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from pacer_cli.auth import AuthResult +from pacer_cli.config import PacerConfig +from pacer_cli.downloader import ( + DocketDownloader, + DocumentDownloader, + DownloadResult, + download_docket, + extract_document_metadata, + get_document_by_number, + load_cached_documents, +) + + +@pytest.fixture +def config() -> PacerConfig: + return PacerConfig(username="user", password="pass") + + +def _resp(status_code=200, text="", content=None, headers=None, url="https://ecf.nysd.uscourts.gov"): + resp = MagicMock(spec=requests.Response) + resp.status_code = status_code + resp.text = text + resp.content = content if content is not None else text.encode() + resp.headers = headers or {} + resp.url = url + resp.cookies = {} + if status_code >= 400: + resp.raise_for_status.side_effect = requests.HTTPError(response=resp) + else: + resp.raise_for_status.return_value = None + return resp + + +# Realistic docket HTML, long enough (>1000 chars) to pass the length check. +DOCKET_HTML = ( + "CM/ECF - nysd\n" + "

CASE #: 1:18-cv-08434-VEC-SLC

\n" + "\n" + "
\n" + "Apple Inc. v. Samsung Electronics Co.
\n" + "Assigned to: Judge Vernon S. Broderick
\n" + "Cause: 28:1332 Diversity\n" + "
\n" + "Date Filed: 09/15/2018
\n" + "
\n" + "\n" + "" + "\n" + "" + "\n" + "
09/15/20181COMPLAINT filed
09/20/20182MOTION to dismiss
\n" + ("\n" * 60) + "" +) + + +# --------------------------------------------------------------------------- +# DownloadResult dataclass +# --------------------------------------------------------------------------- + + +class TestDownloadResult: + def test_defaults(self): + r = DownloadResult(success=True) + assert r.success is True + assert r.filepath is None + assert r.docs_filepath is None + assert r.error is None + assert r.pages == 0 + assert r.cost == 0.0 + + def test_full(self): + r = DownloadResult( + success=False, filepath=Path("/tmp/x.html"), error="boom", pages=3, cost=0.3 + ) + assert r.error == "boom" + assert r.pages == 3 + assert r.cost == 0.3 + + +# --------------------------------------------------------------------------- +# extract_document_metadata +# --------------------------------------------------------------------------- + + +class TestExtractDocumentMetadata: + def test_extracts_documents(self): + meta = extract_document_metadata( + DOCKET_HTML, "https://ecf.nysd.uscourts.gov", case_number="500997", court_id="nysd" + ) + assert meta["case_number"] == "500997" + assert meta["court_id"] == "nysd" + assert meta["case_title"] == "Apple Inc. v. Samsung Electronics Co." + assert meta["document_count"] == 2 + urls = [d["url"] for d in meta["documents"]] + assert "https://ecf.nysd.uscourts.gov/doc1/123" in urls + assert "https://ecf.nysd.uscourts.gov/doc1/124" in urls + assert "downloaded_at" in meta and meta["downloaded_at"].endswith("Z") + + def test_falls_back_to_parsed_meta(self): + # No explicit case_number/court_id -> uses parsed docket meta. + meta = extract_document_metadata(DOCKET_HTML, "https://ecf.nysd.uscourts.gov") + assert meta["case_number"] == "1:18-cv-08434-VEC-SLC" + assert meta["court_id"] == "nysd" + + def test_parse_failure_returns_empty_manifest(self): + with patch("pacer_cli.parser.parse_docket", side_effect=ValueError("boom")): + meta = extract_document_metadata( + "", "https://ecf.x.uscourts.gov", case_number="1", court_id="x" + ) + assert meta["parse_error"] is True + assert meta["documents"] == [] + assert meta["case_number"] == "1" + + def test_entries_without_docnum_are_skipped(self): + # Third row has no link and no doc number -> excluded. + html = DOCKET_HTML.replace( + "\n", + "10/01/2018ORDER text\n\n", + 1, + ) + meta = extract_document_metadata(html, "https://ecf.nysd.uscourts.gov") + # Still only the two linked documents. + assert meta["document_count"] == 2 + + +# --------------------------------------------------------------------------- +# load_cached_documents / get_document_by_number +# --------------------------------------------------------------------------- + + +class TestCacheHelpers: + def _write_docs(self, case_dir: Path, payload: dict) -> None: + case_dir.mkdir(parents=True, exist_ok=True) + (case_dir / "docs.json").write_text(json.dumps(payload), encoding="utf-8") + + def test_load_missing_returns_none(self, tmp_path): + assert load_cached_documents(tmp_path) is None + + def test_load_valid(self, tmp_path): + payload = {"documents": [{"doc_num": "1"}]} + self._write_docs(tmp_path, payload) + assert load_cached_documents(tmp_path) == payload + + def test_load_corrupt_returns_none(self, tmp_path): + (tmp_path / "docs.json").write_text("{not json", encoding="utf-8") + assert load_cached_documents(tmp_path) is None + + def test_get_document_by_number_found(self, tmp_path): + self._write_docs( + tmp_path, + {"documents": [{"doc_num": "1", "url": "u1"}, {"doc_num": "2", "url": "u2"}]}, + ) + doc = get_document_by_number(tmp_path, "2") + assert doc["url"] == "u2" + + def test_get_document_by_number_not_found(self, tmp_path): + self._write_docs(tmp_path, {"documents": [{"doc_num": "1"}]}) + assert get_document_by_number(tmp_path, "99") is None + + def test_get_document_by_number_no_cache(self, tmp_path): + assert get_document_by_number(tmp_path, "1") is None + + +# --------------------------------------------------------------------------- +# DocketDownloader.authenticate +# --------------------------------------------------------------------------- + + +class TestDocketAuth: + def test_authenticate_success(self, config): + dl = DocketDownloader(config, verbose=True) + with patch("pacer_cli.downloader.authenticate") as m: + m.return_value = AuthResult(success=True, token="tok" + "x" * 100) + assert dl.authenticate() is True + assert dl.token.startswith("tok") + + def test_authenticate_failure(self, config): + dl = DocketDownloader(config) + with patch("pacer_cli.downloader.authenticate") as m: + m.return_value = AuthResult(success=False, error="bad creds") + assert dl.authenticate() is False + assert dl.token is None + + +# --------------------------------------------------------------------------- +# DocketDownloader._cso_login +# --------------------------------------------------------------------------- + + +@pytest.fixture +def config_mfa() -> PacerConfig: + return PacerConfig(username="user", password="pass", totp_secret="JBSWY3DPEHPK3PXP") + + +class TestCsoLogin: + def _dl(self, config): + dl = DocketDownloader(config, verbose=True) + dl.session = MagicMock(spec=requests.Session) + return dl + + def test_success_redirected_to_app(self, config): + dl = self._dl(config) + app_url = "https://ecf.nysd.uscourts.gov/cgi-bin/iquery.pl" + login_page = ( + 'name="javax.faces.ViewState" id="x" value="vs123" ' + "loginForm" + ) + dl.session.get.return_value = _resp(text=login_page) + dl.session.post.return_value = _resp(text="logged in", url=app_url) + assert dl._cso_login("NYSDC", app_url) is True + # ViewState carried into the form POST + post_data = dl.session.post.call_args.kwargs["data"] + assert post_data["javax.faces.ViewState"] == "vs123" + assert post_data["loginForm:loginName"] == "user" + + def test_jakarta_viewstate_variant(self, config): + dl = self._dl(config) + app_url = "https://ecf.nysd.uscourts.gov/app" + login_page = 'name="jakarta.faces.ViewState" value="vsJ"' + dl.session.get.return_value = _resp(text=login_page) + dl.session.post.return_value = _resp(text="DktRpt", url="https://x/DktRpt.pl") + assert dl._cso_login("NYSDC", app_url) is True + post_data = dl.session.post.call_args.kwargs["data"] + assert post_data["jakarta.faces.ViewState"] == "vsJ" + + def test_invalid_credentials(self, config): + dl = self._dl(config) + dl.session.get.return_value = _resp(text="login page") + # ends back on a login URL with an Invalid message + dl.session.post.return_value = _resp( + text="Invalid username or password error", + url="https://pacer.login.uscourts.gov/csologin/login.jsf", + ) + assert dl._cso_login("NYSDC", "https://ecf.nysd.uscourts.gov/app") is False + + def test_proceeds_with_session_cookies(self, config): + dl = self._dl(config) + dl.session.get.return_value = _resp(text="login") + post = _resp(text="neutral page", url="https://somewhere/else") + dl.session.post.return_value = post + dl.session.cookies = {"PacerUser": "v"} + assert dl._cso_login("NYSDC", "https://ecf.nysd.uscourts.gov/app") is True + + def test_uncertain_returns_true(self, config): + dl = self._dl(config) + dl.session.get.return_value = _resp(text="login") + dl.session.post.return_value = _resp( + text="neutral", url="https://somewhere/else" + ) + dl.session.cookies = {} + assert dl._cso_login("NYSDC", "https://ecf.nysd.uscourts.gov/app") is True + + def test_exception_returns_false(self, config): + dl = self._dl(config) + dl.session.get.side_effect = requests.ConnectionError("down") + assert dl._cso_login("NYSDC", "https://ecf.nysd.uscourts.gov/app") is False + + def test_mfa_flow_with_redirect(self, config_mfa): + dl = self._dl(config_mfa) + app_url = "https://ecf.nysd.uscourts.gov/app" + login_page = 'name="jakarta.faces.ViewState" value="vs1"' + # initial POST returns an mfaForm; MFA AJAX POST returns a redirect XML + mfa_page = 'mfaForm name="jakarta.faces.ViewState" value="vs2"' + mfa_redirect = '' # noqa: E501 + dl.session.get.side_effect = [ + _resp(text=login_page), # GET login page + _resp(text="final app", url=app_url), # GET redirect target + ] + dl.session.post.side_effect = [ + _resp(text=mfa_page, url="https://pacer.login.uscourts.gov/csologin/login.jsf"), + _resp(text=mfa_redirect), + ] + with patch("pacer_cli.auth.generate_totp", return_value="123456"), patch( + "pacer_cli.downloader.time.sleep" + ): + assert dl._cso_login("NYSDC", app_url) is True + # MFA AJAX POST included the OTP + mfa_data = dl.session.post.call_args_list[1].kwargs["data"] + assert mfa_data["mfaForm:mfaInput"] == "123456" + + +# --------------------------------------------------------------------------- +# DocketDownloader helpers +# --------------------------------------------------------------------------- + + +class TestDocketHelpers: + def test_get_case_id_from_link(self, config): + dl = DocketDownloader(config) + assert ( + dl._get_case_id_from_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997" + ) + == "500997" + ) + + def test_get_case_id_from_link_none(self, config): + dl = DocketDownloader(config) + assert dl._get_case_id_from_link("https://ecf.nysd.uscourts.gov/cgi-bin/x.pl") is None + + def test_get_ecf_base_url(self, config): + dl = DocketDownloader(config) + assert dl._get_ecf_base_url("nysdce") == "https://ecf.nysd.uscourts.gov" + + def test_get_ecf_base_url_bankruptcy(self, config): + dl = DocketDownloader(config) + # ends with 'bk' after stripping -> bankruptcy branch + assert dl._get_ecf_base_url("nybbk").startswith("https://ecf.") + + +# --------------------------------------------------------------------------- +# DocketDownloader.download_docket_by_link +# --------------------------------------------------------------------------- + + +class TestDownloadDocketByLink: + def _dl(self, config, token="tok" + "x" * 100): + dl = DocketDownloader(config) + dl.session = MagicMock(spec=requests.Session) + dl.token = token + return dl + + def test_auth_failure_returns_error(self, config, tmp_path): + dl = DocketDownloader(config) + dl.session = MagicMock(spec=requests.Session) + # token is None -> authenticate() runs and fails + with patch.object(dl, "authenticate", return_value=False): + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is False + assert result.error == "Authentication failed" + + def test_bad_case_link_no_id(self, config, tmp_path): + dl = self._dl(config) + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl", tmp_path + ) + assert result.success is False + assert "Could not extract case ID" in result.error + + def test_successful_direct_docket(self, config, tmp_path): + dl = self._dl(config) + # query menu returns a normal page, then DktRpt returns the full docket directly. + dl.session.get.side_effect = [ + _resp(text="query menu ok", url="https://ecf.nysd.uscourts.gov/x"), + _resp(text=DOCKET_HTML, url="https://ecf.nysd.uscourts.gov/cgi-bin/DktRpt.pl"), + ] + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is True + assert result.filepath.exists() + assert result.filepath.name == "nysd_500997.html" + assert result.docs_filepath.exists() + docs = json.loads(result.docs_filepath.read_text()) + assert docs["document_count"] == 2 + assert result.pages >= 1 + + def test_custom_filename(self, config, tmp_path): + dl = self._dl(config) + dl.session.get.side_effect = [ + _resp(text="ok"), + _resp(text=DOCKET_HTML), + ] + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", + tmp_path, + filename="custom.html", + ) + assert result.filepath.name == "custom.html" + + def test_report_config_form_submitted(self, config, tmp_path): + dl = self._dl(config) + form_page = ( + '
' + 'Sort by ' + '
' + ) + dl.session.get.side_effect = [ + _resp(text="query menu"), + _resp(text=form_page), # DktRpt returns config form + ] + dl.session.post.return_value = _resp(text=DOCKET_HTML) + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is True + # the hidden field 'hidtok' should be carried into POST data + post_data = dl.session.post.call_args.kwargs["data"] + assert post_data["hidtok"] == "abc" + assert post_data["all_case_ids"] == "500997" + + def test_form_without_action_uses_docket_url(self, config, tmp_path): + dl = self._dl(config) + form_page = "Sort by date_from no form action here" + dl.session.get.side_effect = [ + _resp(text="query menu"), + _resp(text=form_page), + ] + dl.session.post.return_value = _resp(text=DOCKET_HTML) + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is True + + def test_js_redirect_triggers_cso_login(self, config, tmp_path): + dl = self._dl(config) + js_page = ( + "" + ) + dl.session.get.side_effect = [ + _resp(text=js_page), # query menu -> JS redirect + _resp(text="after login ok"), # retry after login + _resp(text=DOCKET_HTML), # DktRpt + ] + with patch.object(dl, "_cso_login", return_value=True) as cso: + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + cso.assert_called_once() + assert cso.call_args[0][0] == "NYSDC" + assert result.success is True + + def test_js_redirect_cso_login_fails(self, config, tmp_path): + dl = self._dl(config) + js_page = ( + "location.assign(" + '"https://pacer.login.uscourts.gov/csologin/login.jsf?pscCourtId=NYSDC")' + "" + ) + dl.session.get.side_effect = [_resp(text=js_page)] + with patch.object(dl, "_cso_login", return_value=False): + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is False + assert result.error == "CSO login failed" + + def test_js_redirect_still_redirected_after_login(self, config, tmp_path): + dl = self._dl(config) + js_page = ( + "location.assign(" + '"https://pacer.login.uscourts.gov/csologin/login.jsf?pscCourtId=NYSDC")' + "" + ) + dl.session.get.side_effect = [ + _resp(text=js_page), # query menu + _resp(text=js_page), # still redirecting after login + ] + with patch.object(dl, "_cso_login", return_value=True): + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is False + assert "still being redirected" in result.error + + def test_401_from_query_menu(self, config, tmp_path): + dl = self._dl(config) + dl.session.get.side_effect = [_resp(status_code=401, text="denied")] + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is False + assert "token rejected" in result.error + + def test_login_page_url_detected(self, config, tmp_path): + dl = self._dl(config) + dl.session.get.side_effect = [ + _resp(text="ok", url="https://pacer.login.uscourts.gov/login.jsf") + ] + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is False + assert "Session expired" in result.error + + def test_network_error_on_query_menu(self, config, tmp_path): + dl = self._dl(config) + dl.session.get.side_effect = requests.ConnectionError("down") + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is False + assert "Network error" in result.error + + def test_short_response_is_invalid(self, config, tmp_path): + dl = self._dl(config) + dl.session.get.side_effect = [ + _resp(text="query menu ok padding to pass"), + _resp(text="too short"), # DktRpt response < 1000 chars + ] + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is False + assert "empty or invalid" in result.error + + def test_network_error_on_docket_request(self, config, tmp_path): + dl = self._dl(config) + dl.session.get.side_effect = [ + _resp(text="query menu ok"), + requests.ConnectionError("down"), + ] + result = dl.download_docket_by_link( + "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997", tmp_path + ) + assert result.success is False + assert "Network error" in result.error + + +# --------------------------------------------------------------------------- +# DocketDownloader.download_docket_by_case_number +# --------------------------------------------------------------------------- + + +class TestDownloadDocketByCaseNumber: + def test_case_not_found(self, config, tmp_path): + dl = DocketDownloader(config) + empty = MagicMock() + empty.content = [] + with patch("pacer_cli.pcl.PCLClient") as pcl_cls: + pcl_cls.return_value.search_cases.return_value = empty + result = dl.download_docket_by_case_number( + "1:2018cv08434", "nysdce", tmp_path + ) + assert result.success is False + assert "Case not found" in result.error + + def test_no_case_link(self, config, tmp_path): + dl = DocketDownloader(config) + case = MagicMock() + case.case_link = None + results = MagicMock() + results.content = [case] + with patch("pacer_cli.pcl.PCLClient") as pcl_cls: + pcl_cls.return_value.search_cases.return_value = results + result = dl.download_docket_by_case_number( + "1:2018cv08434", "nysdce", tmp_path + ) + assert result.success is False + assert "no caseLink" in result.error + + def test_found_delegates_to_by_link(self, config, tmp_path): + dl = DocketDownloader(config) + case = MagicMock() + case.case_link = "https://ecf.nysd.uscourts.gov/cgi-bin/iqquerymenu.pl?500997" + results = MagicMock() + results.content = [case] + sentinel = DownloadResult(success=True, filepath=tmp_path / "x.html") + with patch("pacer_cli.pcl.PCLClient"), patch.object( + dl, "download_docket_by_link", return_value=sentinel + ) as by_link: + result = dl.download_docket_by_case_number( + "1:2018cv08434", "nysdce", tmp_path + ) + assert result is sentinel + # default filename derived from case number (':' -> '+'), passed positionally + assert by_link.call_args[0][2] == "nysdce_1+2018cv08434.html" + + +# --------------------------------------------------------------------------- +# download_docket convenience function +# --------------------------------------------------------------------------- + + +class TestDownloadDocketFunction: + def test_delegates(self, config, tmp_path): + sentinel = DownloadResult(success=True) + with patch.object( + DocketDownloader, "download_docket_by_case_number", return_value=sentinel + ) as m: + result = download_docket(config, "1:2018cv08434", "nysdce", tmp_path) + assert result is sentinel + m.assert_called_once() + + +# --------------------------------------------------------------------------- +# DocumentDownloader helpers +# --------------------------------------------------------------------------- + + +class TestDocumentDownloaderHelpers: + def test_session_lazily_creates_docket_downloader(self, config): + dd = DocumentDownloader(config) + assert dd._docket_dl is None + sess = dd.session + assert dd._docket_dl is not None + assert sess is dd._docket_dl.session + + def test_is_valid_pdf(self, config): + dd = DocumentDownloader(config) + assert dd._is_valid_pdf(b"%PDF-1.7 stuff") is True + assert dd._is_valid_pdf(b"") is False + assert dd._is_valid_pdf(b"%PD") is False + + def test_get_court_from_url(self, config): + dd = DocumentDownloader(config) + assert dd._get_court_from_url("https://ecf.nysd.uscourts.gov/doc1/1") == "nysd" + + def test_is_login_redirect_by_url(self, config): + dd = DocumentDownloader(config) + resp = _resp(text="", url="https://x/csologin/login.jsf") + assert dd._is_login_redirect(resp) is True + + def test_is_login_redirect_by_body(self, config): + dd = DocumentDownloader(config) + resp = _resp(text='location.assign("https://x/csologin/login.jsf")', url="https://ok") + assert dd._is_login_redirect(resp) is True + + def test_is_login_redirect_false(self, config): + dd = DocumentDownloader(config) + resp = _resp(text="fine", url="https://ecf.nysd.uscourts.gov/doc1/1") + assert dd._is_login_redirect(resp) is False + + +# --------------------------------------------------------------------------- +# DocumentDownloader._ensure_authenticated +# --------------------------------------------------------------------------- + + +class TestEnsureAuthenticated: + def test_cached_court_short_circuits(self, config): + dd = DocumentDownloader(config) + dd.authenticated_courts.add("nysd") + assert dd._ensure_authenticated("https://ecf.nysd.uscourts.gov/doc1/1") is True + + def test_auth_failure(self, config): + dd = DocumentDownloader(config) + with patch.object(DocketDownloader, "authenticate", return_value=False): + assert dd._ensure_authenticated("https://ecf.nysd.uscourts.gov/doc1/1") is False + + def test_success_no_login_redirect(self, config): + dd = DocumentDownloader(config) + with patch.object(DocketDownloader, "authenticate", return_value=True): + dd._docket_dl = DocketDownloader(config) + dd._docket_dl.session = MagicMock(spec=requests.Session) + dd._docket_dl.session.get.return_value = _resp( + text="ok", url="https://ecf.nysd.uscourts.gov/cgi-bin/iquery.pl" + ) + assert dd._ensure_authenticated("https://ecf.nysd.uscourts.gov/doc1/1") is True + assert "nysd" in dd.authenticated_courts + + def test_cso_login_required_and_succeeds(self, config): + dd = DocumentDownloader(config) + dd._docket_dl = DocketDownloader(config) + dd._docket_dl.session = MagicMock(spec=requests.Session) + dd._docket_dl.session.get.return_value = _resp( + text="ok", url="https://x/csologin/login.jsf" + ) + with patch.object(DocketDownloader, "authenticate", return_value=True), patch.object( + dd._docket_dl, "_cso_login", return_value=True + ): + assert dd._ensure_authenticated("https://ecf.nysd.uscourts.gov/doc1/1") is True + + def test_cso_login_required_and_fails(self, config): + dd = DocumentDownloader(config) + dd._docket_dl = DocketDownloader(config) + dd._docket_dl.session = MagicMock(spec=requests.Session) + dd._docket_dl.session.get.return_value = _resp( + text="ok", url="https://x/csologin/login.jsf" + ) + with patch.object(DocketDownloader, "authenticate", return_value=True), patch.object( + dd._docket_dl, "_cso_login", return_value=False + ): + assert dd._ensure_authenticated("https://ecf.nysd.uscourts.gov/doc1/1") is False + + +# --------------------------------------------------------------------------- +# DocumentDownloader.download_document +# --------------------------------------------------------------------------- + + +class TestDownloadDocument: + def _dd(self, config): + dd = DocumentDownloader(config) + dd._docket_dl = DocketDownloader(config) + dd._docket_dl.session = MagicMock(spec=requests.Session) + dd.authenticated_courts.add("nysd") # skip the auth dance + return dd + + def test_auth_failure(self, config, tmp_path): + dd = DocumentDownloader(config) + with patch.object(dd, "_ensure_authenticated", return_value=False): + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/1", tmp_path) + assert result.success is False + assert result.error == "Authentication failed" + + def test_direct_pdf(self, config, tmp_path): + dd = self._dd(config) + pdf = b"%PDF-1.7" + b"x" * 6000 + dd.session.get.return_value = _resp( + content=pdf, headers={"content-type": "application/pdf"} + ) + result = dd.download_document( + "https://ecf.nysd.uscourts.gov/doc1/127133396215", tmp_path + ) + assert result.success is True + assert result.filepath.suffix == ".pdf" + assert result.filepath.name == "127133396215.pdf" + assert result.filepath.read_bytes() == pdf + assert result.pages >= 1 + assert result.cost > 0 + assert result.error is None + + def test_login_redirect_early(self, config, tmp_path): + dd = self._dd(config) + dd.session.get.return_value = _resp(text="x", url="https://x/csologin/login.jsf") + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/1", tmp_path) + assert result.success is False + assert "login required" in result.error + + def test_html_with_pdf_link_followed(self, config, tmp_path): + dd = self._dd(config) + html = 'View' + pdf = b"%PDF-1.7" + b"y" * 3000 + dd.session.get.side_effect = [ + _resp(text=html, headers={"content-type": "text/html"}), + _resp(content=pdf, headers={"content-type": "application/pdf"}), + ] + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/1", tmp_path) + assert result.success is True + assert result.filepath.read_bytes() == pdf + # the followed PDF link is absolutized + assert dd.session.get.call_args_list[1][0][0] == ( + "https://ecf.nysd.uscourts.gov/doc1/123.pdf" + ) + + def test_godls_receipt_flow(self, config, tmp_path): + dd = self._dd(config) + receipt = ( + "View Document " + "" + ) + iframe_page = '' + pdf = b"%PDF-1.7" + b"z" * 4000 + dd.session.get.side_effect = [ + _resp(text=receipt, headers={"content-type": "text/html"}), # initial + _resp(content=pdf, headers={"content-type": "application/pdf"}), # iframe pdf + ] + dd.session.post.return_value = _resp( + text=iframe_page, headers={"content-type": "text/html"} + ) + result = dd.download_document( + "https://ecf.nysd.uscourts.gov/doc1/127133396215", tmp_path + ) + assert result.success is True + assert result.filepath.read_bytes() == pdf + post_data = dd.session.post.call_args.kwargs["data"] + assert post_data["caseid"] == "500997" + assert post_data["de_seq_num"] == "42" + assert post_data["magic_num"] == "magic123" + + def test_iframe_followed(self, config, tmp_path): + dd = self._dd(config) + html = '' + pdf = b"%PDF-1.7" + b"q" * 3000 + dd.session.get.side_effect = [ + _resp(text=html, headers={"content-type": "text/html"}), + _resp(content=pdf, headers={"content-type": "application/pdf"}), + ] + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/9", tmp_path) + assert result.success is True + assert dd.session.get.call_args_list[1][0][0] == ( + "https://ecf.nysd.uscourts.gov/show.pl?doc=1" + ) + + def test_generic_form_submitted(self, config, tmp_path): + dd = self._dd(config) + html = ( + "
" + '' + "
" + ) + pdf = b"%PDF-1.7" + b"w" * 3000 + dd.session.get.return_value = _resp(text=html, headers={"content-type": "text/html"}) + dd.session.post.return_value = _resp( + content=pdf, headers={"content-type": "application/pdf"} + ) + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/3", tmp_path) + assert result.success is True + assert dd.session.post.call_args.kwargs["data"] == {"tok": "abc"} + + def test_html_instead_of_pdf_warning(self, config, tmp_path): + dd = self._dd(config) + html = "This document is sealed" + dd.session.get.return_value = _resp(text=html, headers={"content-type": "text/html"}) + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/5", tmp_path) + # Success (file written) but with a warning in the error field. + assert result.success is True + assert result.filepath.suffix == ".html" + assert "Got HTML instead of PDF" in result.error + assert result.pages == 0 + + def test_pdf_content_type_missing_magic_bytes(self, config, tmp_path): + dd = self._dd(config) + # content-type says pdf, but bytes are not a real PDF + dd.session.get.return_value = _resp( + content=b"not a pdf body", headers={"content-type": "application/pdf"} + ) + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/6", tmp_path) + assert result.success is True + assert result.filepath.suffix == ".pdf" + assert "missing magic bytes" in result.error + assert result.pages == 0 # not counted because not valid PDF + + def test_binary_fallback_extension(self, config, tmp_path): + dd = self._dd(config) + dd.session.get.return_value = _resp( + content=b"\x00\x01\x02data", headers={"content-type": "application/octet-stream"} + ) + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/8", tmp_path) + assert result.success is True + assert result.filepath.suffix == ".bin" + + def test_custom_filename(self, config, tmp_path): + dd = self._dd(config) + pdf = b"%PDF-1.7" + b"x" * 3000 + dd.session.get.return_value = _resp( + content=pdf, headers={"content-type": "application/pdf"} + ) + result = dd.download_document( + "https://ecf.nysd.uscourts.gov/doc1/1", tmp_path, filename="mydoc.pdf" + ) + assert result.filepath.name == "mydoc.pdf" + + def test_no_doc_id_in_url_uses_default_name(self, config, tmp_path): + dd = self._dd(config) + pdf = b"%PDF-1.7" + b"x" * 3000 + dd.session.get.return_value = _resp( + content=pdf, headers={"content-type": "application/pdf"} + ) + # URL without /doc1/ -> filename uses "document" + result = dd.download_document("https://ecf.nysd.uscourts.gov/other", tmp_path) + assert result.filepath.name == "document.pdf" + + def test_login_redirect_after_following_links(self, config, tmp_path): + dd = self._dd(config) + # HTML with a PDF link, but the followed link is a login redirect. + html = 'x' + dd.session.get.side_effect = [ + _resp(text=html, headers={"content-type": "text/html"}), + _resp(text="x", url="https://x/csologin/login.jsf"), + ] + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/1", tmp_path) + assert result.success is False + assert "Session expired during document fetch" in result.error + + def test_network_error(self, config, tmp_path): + dd = self._dd(config) + dd.session.get.side_effect = requests.ConnectionError("down") + result = dd.download_document("https://ecf.nysd.uscourts.gov/doc1/1", tmp_path) + assert result.success is False + assert "Network error" in result.error