diff --git a/.gitignore b/.gitignore index be06511..e37ec16 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,11 @@ -# created by virtualenv automatically .secret.txt +result +logs +config.toml +cookies.json +state.json +venv +__pycache__ +*.bak +database.db +december.txt diff --git a/AvitoParser.py b/AvitoParser.py index 02d4dbb..6c0e7b8 100644 --- a/AvitoParser.py +++ b/AvitoParser.py @@ -1,6 +1,7 @@ import threading import time from pathlib import Path +import prompt_user_login import flet as ft import tkinter as tk @@ -76,6 +77,7 @@ def set_up(): parse_views.value = config.parse_views save_xlsx.value = config.save_xlsx use_webdriver.value = config.use_webdriver + playwright_state_file.value = config.playwright_state_file page.update() @@ -111,6 +113,7 @@ def save_config(): "parse_views": parse_views.value, "save_xlsx": save_xlsx.value, "use_webdriver": use_webdriver.value, + "playwright_state_file": playwright_state_file.value, }} save_avito_config(config) @@ -165,6 +168,9 @@ def open_dlg_modal(e): dlg_modal_proxy.open = True page.update() + async def btn_prompt_user_login_handler(e): + await prompt_user_login.wrapper() + def start_parser(e): nonlocal is_run result = check_string() @@ -179,6 +185,10 @@ def start_parser(e): stop_btn.visible = True is_run = True page.update() + + threading.Thread(target=run_parser, daemon=True).start() + + def run_parser(): while is_run and not stop_event.is_set(): run_process() if not is_run: @@ -290,14 +300,14 @@ def run_process(): tooltip=TG_TOKEN_HELP) tg_chat_id = ft.TextField(label="Chat id telegram. 
Можно несколько через Enter", width=400, multiline=True, expand=True, text_size=12, height=70, tooltip=TG_CHAT_ID_HELP) - btn_test_tg = ft.ElevatedButton(text="Проверить tg", disabled=False, on_click=telegram_log_test, expand=True, + btn_test_tg = ft.Button("Проверить tg", disabled=False, on_click=telegram_log_test, expand=True, tooltip=BTN_TEST_TG_HELP) proxy = ft.TextField(label="Прокси в формате username:password@mproxy.site:port", width=400, expand=True, tooltip=PROXY_HELP) proxy_change_ip = ft.TextField( label="Ссылка для изменения IP, в формате https://changeip.mobileproxy.space/?proxy_key=***", width=400, expand=True, tooltip=PROXY_CHANGE_IP_HELP) - proxy_btn_help = ft.ElevatedButton(text="Подробнее про прокси", on_click=open_dlg_modal, expand=True, + proxy_btn_help = ft.Button("Подробнее про прокси", on_click=open_dlg_modal, expand=True, tooltip=PROXY_BTN_HELP_HELP) geo = ft.TextField(label="Ограничение по городу", width=400, expand=True, text_size=12, height=30, tooltip=GEO_HELP) @@ -313,16 +323,16 @@ def run_process(): ) start_btn = ft.FilledButton("Старт", width=800, on_click=start_parser, expand=True) stop_btn = ft.OutlinedButton("Стоп", width=980, on_click=stop_parser, visible=False, - style=ft.ButtonStyle(bgcolor=ft.colors.RED_400), expand=True) - console_widget = ft.Text(width=800, height=60, color=ft.colors.GREEN, value="", selectable=True, + style=ft.ButtonStyle(bgcolor=ft.Colors.RED_400), expand=True) + console_widget = ft.Text(width=800, height=60, color=ft.Colors.GREEN, value="", selectable=True, expand=True) buy_me_coffe_btn = ft.TextButton("Продвинуть разработку", on_click=lambda e: page.launch_url(DONAT_LINK), - style=ft.ButtonStyle(color=ft.colors.GREEN_300), expand=True, + style=ft.ButtonStyle(color=ft.Colors.GREEN_300), expand=True, tooltip=BUY_ME_COFFE_BTN_HELP) report_issue_btn = ft.TextButton("Сообщить о проблеме", on_click=lambda e: page.launch_url( - "https://github.com/Duff89/parser_avito/issues"), 
style=ft.ButtonStyle(color=ft.colors.GREY), expand=True, + "https://github.com/Duff89/parser_avito/issues"), style=ft.ButtonStyle(color=ft.Colors.GREY), expand=True, tooltip=REPORT_ISSUE_BTN_HELP) ignore_ads_in_reserv = ft.Checkbox(label="Игнор-ть резервы", value=True, tooltip=IGNORE_RESERV_HELP) ignore_promote_ads = ft.Checkbox(label="Игнор-ть продвинутые", value=False) @@ -336,7 +346,10 @@ def run_process(): use_webdriver = ft.Checkbox(label="Использовать браузер", value=True, tooltip=USE_WEBDRIVER_HELP) - + playwright_state_file = ft.TextField(label="Файл сессии Авито", width=400, expand=True, text_size=12, height=50, + tooltip=PLAYWRIGHT_STATE_FILE_HELP) + btn_prompt_user_login = ft.Button("Войти в аккаунт Авито", on_click=btn_prompt_user_login_handler, expand=True, + tooltip=PROMPT_USER_LOGIN_HELP) input_fields = ft.Column( [ @@ -370,7 +383,12 @@ def run_process(): alignment=ft.MainAxisAlignment.CENTER, spacing=0 ), - seller_black_list, + ft.Row( + [seller_black_list, playwright_state_file], + alignment=ft.MainAxisAlignment.CENTER, + spacing=0 + ), + btn_prompt_user_login, ft.Row( [tg_token, tg_chat_id], alignment=ft.MainAxisAlignment.CENTER, @@ -431,7 +449,4 @@ def start_page(): logger_console_init() -ft.app( - target=main, - assets_dir="assets", -) +ft.run(main, assets_dir="assets") diff --git a/README.md b/README.md index 6d39723..7fa7be1 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,9 @@ python parser_cls.py ``` +### Использование аккаунта Авито +Для обхода капчи и ограничений по количеству запросов можно подвязать к парсеру свой аккаунт. Категорически **НЕ** рекомендуется использовать свой основной аккаунт. Для привязки аккаунта прописываем в конфиге `playwright_state_file = "state.json"`, после чего запускаем на системе с графическим интерфейсом `python prompt_user_login.py` и авторизуемся. Далее запускаем парсер, он автоматически подтянет сессию из `playwright_state_file` и будет обновлять её в файле по мере отправки запросов. 
+ ## Работа через Docker Перед использованием настройте `config.toml` diff --git a/config.toml b/config.toml index fbad744..98897d7 100644 --- a/config.toml +++ b/config.toml @@ -27,3 +27,4 @@ one_file_for_link = false parse_views = false save_xlsx = true use_webdriver = true +playwright_state_file = "state.json" diff --git a/dto.py b/dto.py index 75fe121..ccfcd8c 100644 --- a/dto.py +++ b/dto.py @@ -42,4 +42,5 @@ class AvitoConfig: parse_views: bool = False save_xlsx: bool = True use_webdriver: bool = True + playwright_state_file: str | None = None diff --git a/get_cookies.py b/get_cookies.py index 7f05a40..e51e1d6 100644 --- a/get_cookies.py +++ b/get_cookies.py @@ -5,9 +5,11 @@ from playwright.async_api import async_playwright from playwright_stealth import Stealth from typing import Optional, Dict, List +from playwright.async_api import Error, TimeoutError from dto import Proxy, ProxySplit from playwright_setup import ensure_playwright_installed +from load_config import load_avito_config MAX_RETRIES = 3 RETRY_DELAY = 10 @@ -77,6 +79,12 @@ def parse_cookie_string(cookie_str: str) -> dict: async def launch_browser(self): ensure_playwright_installed("chromium") + + try: + config = load_avito_config("config.toml") + except Exception as err: + logger.error(f"Ошибка загрузки конфига: {err}") + stealth = Stealth() self.playwright_context = stealth.use_async(async_playwright()) playwright = await self.playwright_context.__aenter__() @@ -105,6 +113,12 @@ async def launch_browser(self): "has_touch": False, } + if isinstance(config.playwright_state_file,str): + context_args["storage_state"] = config.playwright_state_file + logger.debug(f"Используем Playwright state file {config.playwright_state_file}") + else: + logger.debug("Playwright state file не задан. 
Используем пустой контекст Playwright.") + if self.proxy_split_obj: context_args["proxy"] = { "server": self.proxy_split_obj.ip_port, diff --git a/lang.py b/lang.py index 52c2efd..0c1c5f6 100644 --- a/lang.py +++ b/lang.py @@ -55,3 +55,6 @@ SAVE_XLSX_HELP = "Сохранять результат в Excel файл?" USE_WEBDRIVER_HELP = ("Использовать эмулятор браузера или нет. Если запускаете на сервере, где мало оперативной памяти" " и работать будет долго - полезнее будет отключить, стабильность повысится") +PLAYWRIGHT_STATE_FILE_HELP = ("Не меняйте это значение если не знаете что делаете. В этот файл будет сохраняться сессия " \ + "пользователя Авито после входа в аккаунт") +PROMPT_USER_LOGIN_HELP = ("Войти а аккаунт Авито для обхода ошибки 429. Категорически НЕ рекомендуется использовать свой основной аккаунт") diff --git a/parser_cls.py b/parser_cls.py index 2d8fa85..dfaadfa 100644 --- a/parser_cls.py +++ b/parser_cls.py @@ -12,10 +12,14 @@ from loguru import logger from pydantic import ValidationError from requests.cookies import RequestsCookieJar +from playwright_setup import ensure_playwright_installed +from playwright.async_api import async_playwright, Playwright +from playwright.async_api import Error, TimeoutError +from pathlib import Path from common_data import HEADERS from db_service import SQLiteDBHandler -from dto import Proxy, AvitoConfig +from dto import Proxy, ProxySplit, AvitoConfig from get_cookies import get_cookies from hide_private_data import log_config from load_config import load_avito_config @@ -37,6 +41,7 @@ def __init__( ): self.config = config self.proxy_obj = self.get_proxy_obj() + self.proxy_split_obj = self.get_proxy_split_obj() self.db_handler = SQLiteDBHandler() self.tg_handler = self.get_tg_handler() self.xlsx_handler = XLSXHandler(self.__get_file_title()) @@ -67,6 +72,47 @@ def get_proxy_obj(self) -> Proxy | None: logger.info("Работаем без прокси") return None + @staticmethod + def check_protocol(ip_port: str) -> str: + if "http://" not in ip_port: + 
return f"http://{ip_port}" + return ip_port + + @staticmethod + def del_protocol(proxy_string: str): + if "//" in proxy_string: + return proxy_string.split("//")[1] + return proxy_string + + def get_proxy_split_obj(self) -> ProxySplit | None: + if not self.proxy_obj: + return None + try: + self.proxy_obj.proxy_string = self.del_protocol(proxy_string=self.proxy_obj.proxy_string) + if "@" in self.proxy_obj.proxy_string: + ip_port, user_pass = self.proxy_obj.proxy_string.split("@") + if "." in user_pass: + ip_port, user_pass = user_pass, ip_port + login, password = str(user_pass).split(":") + else: + login, password, ip, port = self.proxy_obj.proxy_string.split(":") + if "." in login: + login, password, ip, port = ip, port, login, password + ip_port = f"{ip}:{port}" + + ip_port = self.check_protocol(ip_port=ip_port) + + return ProxySplit( + ip_port=ip_port, + login=login, + password=password, + change_ip_link=self.proxy_obj.change_ip_link + ) + except Exception as err: + logger.error(err) + logger.critical("Прокси в таком формате не поддерживаются. 
" + "Используй: ip:port@user:pass или ip:port:user:pass") + def get_cookies(self, max_retries: int = 1, delay: float = 2.0) -> dict | None: if not self.config.use_webdriver: return @@ -165,12 +211,12 @@ def parse(self): for i in range(0, self.config.count): if self.stop_event and self.stop_event.is_set(): return - if DEBUG_MODE: - html_code = open("december.txt", "r", encoding="utf-8").read() - else: - html_code = self.fetch_data(url=url, retries=self.config.max_count_of_retry) - - if not html_code: + try: + if DEBUG_MODE: + html_code = open("december.txt", "r", encoding="utf-8").read() + else: + html_code = asyncio.run(self.get_html(url=url, headless=True)) + except Error as err: logger.warning( f"Не удалось получить HTML для {url}, пробую заново через {self.config.pause_between_links} сек.") time.sleep(self.config.pause_between_links) @@ -381,8 +427,9 @@ def parse_views(self, ads: list[Item]) -> list[Item]: for ad in ads: try: - html_code_full_page = self.fetch_data(url=f"https://www.avito.ru{ad.urlPath}") + html_code_full_page = asyncio.run(self.get_html(url=f"https://www.avito.ru{ad.urlPath}", headless=True)) ad.total_views, ad.today_views = self._extract_views(html=html_code_full_page) + logger.debug(f"Получены просмотры для {ad.id}") delay = random.uniform(0.1, 0.9) time.sleep(delay) except Exception as err: @@ -482,6 +529,103 @@ def get_next_page_url(self, url: str): except Exception as err: logger.error(f"Не смог сформировать ссылку на следующую страницу для {url}. 
Ошибка: {err}") + def is_avito_account_logged_in(self) -> bool: + if isinstance(self.config.playwright_state_file,str): + try: + with open(self.config.playwright_state_file, "r") as f: + state_file = json.load(f) + cookies_list = state_file["cookies"] + for cookie in cookies_list: + # sessid contains avito account session and should be present only after logging in + if cookie["name"] == "sessid": + return True + except: + logger.warning(f"Не удалось загрузить JSON из Playwright state file: {self.config.playwright_state_file}") + return False + else: + return False + + + async def get_html(self, url: str = None, headless: bool = True): + async with async_playwright() as playwright: + ensure_playwright_installed("chromium") + launch_args = { + "headless": headless, + "chromium_sandbox": False, + "args": [ + "--disable-blink-features=AutomationControlled", + "--no-sandbox", + "--disable-dev-shm-usage", + "--start-maximized", + "--window-size=1920,1080", + ] + } + context_args = { + "viewport": {"width": 1920, "height": 1080}, + "screen": {"width": 1920, "height": 1080}, + "device_scale_factor": 1, + "is_mobile": False, + "has_touch": False, + } + if isinstance(self.config.playwright_state_file,str) and self.config.playwright_state_file != "": + context_args["storage_state"] = self.config.playwright_state_file + logger.debug(f"Используем Playwright state file {self.config.playwright_state_file}") + if self.is_avito_account_logged_in(): + logger.info(f"Используем аккаунт Авито") + else: + logger.warning(f"Аккаунт Авито не обнаружен, хотя настроен Playwright state file: {self.config.playwright_state_file}. Войдите в аккаунт через prompt_user_login.py или кнопку \"Войти в аккаунт Авито\"") + else: + logger.debug("Playwright state file не задан. 
Используем пустой контекст Playwright.") + + if self.proxy_split_obj: + context_args["proxy"] = { + "server": self.proxy_split_obj.ip_port, + "username": self.proxy_split_obj.login, + "password": self.proxy_split_obj.password + } + + try: + chromium = playwright.chromium + browser = await chromium.launch(**launch_args) + context = await browser.new_context(**context_args) + page = await context.new_page() + response = await page.goto(url=url, + timeout=60_000, + wait_until="domcontentloaded") + if response.status in [302, 403, 429]: + self.bad_request_count += 1 + self.change_ip() + raise requests.RequestsError(f"Слишком много запросов: {response.status}. Включите прокси либо войдите в аккаунт Авито") + elif response.status >= 500: + self.bad_request_count += 1 + raise requests.RequestsError(f"Ошибка сервера: {response.status}") + elif response.status >= 400: + self.bad_request_count += 1 + raise requests.RequestsError(f"Ошибка клиента: {response.status}") + + except Error as err: + logger.error(err.message) + self.bad_request_count += 1 + await page.close() + await context.close() + await browser.close() + return + + if isinstance(self.config.playwright_state_file,str): + try: + state_file = self.config.playwright_state_file + state_filepath = Path(state_file) + state_filepath.touch(mode=0o600, exist_ok=True) # Set mode to protect sensitive cookies + storage = await context.storage_state(path=state_filepath) + logger.debug(f"Playwright state сохранён в {state_file}") + except: + logger.error(f"Не удалось записать сессию в файл {state_file}") + + if await page.content() is not None: + return await page.content() + else: + logger.warning("Не удалось получить HTML") + return {} if __name__ == "__main__": try: diff --git a/playwright_setup.py b/playwright_setup.py index 241aa10..640a860 100644 --- a/playwright_setup.py +++ b/playwright_setup.py @@ -1,6 +1,7 @@ import subprocess import sys import os +import platform from loguru import logger @@ -11,9 +12,24 @@ def 
ensure_playwright_installed(browser: str = "chromium"): """ try: # === Указываем правильный путь к браузерам === - ms_playwright_dir = os.path.join( - os.path.expanduser("~"), "AppData", "Local", "ms-playwright" - ) + if platform.system() == 'Windows': + ms_playwright_dir = os.path.join( + os.path.expanduser("~"), "AppData", "Local", "ms-playwright" + ) + elif platform.system() == 'Linux': + ms_playwright_dir = os.path.join( + os.path.expanduser("~"), ".cache", "ms-playwright" + ) + elif platform.system() == 'Darwin': + ms_playwright_dir = os.path.join( + os.path.expanduser("~"), "Library", "Caches", "ms-playwright" + ) + else: + # Используем виндовый путь если не удалось определить ОС для гарантированной инициализации ms_playwright_dir + ms_playwright_dir = os.path.join( + os.path.expanduser("~"), "AppData", "Local", "ms-playwright" + ) + os.environ["PLAYWRIGHT_BROWSERS_PATH"] = ms_playwright_dir from playwright._impl._driver import compute_driver_executable @@ -24,7 +40,7 @@ def ensure_playwright_installed(browser: str = "chromium"): else: driver_path = result - browsers_exist = os.path.exists(driver_path) or os.path.exists(ms_playwright_dir) + browsers_exist = os.path.exists(driver_path) and os.path.exists(ms_playwright_dir) if not browsers_exist: logger.info(f"Playwright не найден. 
Устанавливаю {browser}...") diff --git a/prompt_user_login.py b/prompt_user_login.py new file mode 100644 index 0000000..ccf40aa --- /dev/null +++ b/prompt_user_login.py @@ -0,0 +1,77 @@ +import asyncio +from playwright.async_api import async_playwright, Playwright +from loguru import logger +from playwright_setup import ensure_playwright_installed +from pathlib import Path + +from load_config import load_avito_config + +logger.add("logs/app.log", rotation="5 MB", retention="5 days", level="DEBUG") + +async def prompt_user_login(playwright: Playwright): + try: + config = load_avito_config("config.toml") + except Exception as err: + logger.error(f"Ошибка загрузки конфига: {err}") + + if isinstance(config.playwright_state_file,str): + logger.info(f"Сессия будет сохранена в Playwright state file {config.playwright_state_file}") + else: + logger.error("Playwright state file не задан. Сессия не будет сохранена") + return + + ensure_playwright_installed("chromium") + chromium = playwright.chromium + launch_args = { + "headless": False, + "chromium_sandbox": False, + "args": [ + "--disable-blink-features=AutomationControlled", + "--no-sandbox", + "--disable-dev-shm-usage", + "--start-maximized", + ] + } + context_args = { + "is_mobile": False, + "has_touch": False, + "locale": "ru-RU", + "no_viewport": True, + } + try: + browser = await chromium.launch(**launch_args) + context = await browser.new_context(**context_args) + page = await context.new_page() + except: + logger.error("Не удалось запустить графический браузер") + return + + # Waiting for 2fa response because it's enabled on all Avito accounts + try: + async with page.expect_response(url_or_predicate="https://www.avito.ru/web/2/tfa/auth", timeout=0) as response_info: + await page.goto(url="https://www.avito.ru/#login?authsrc=h", timeout=0) + # Reloading the page sometimes helps to bypass captcha (!) 
+ if "Доступ ограничен" in await page.title(): + await page.reload() + + except: + logger.error("Браузер неожиданно закрыт. Сессия не будет сохранена") + return + + + try: + state_file = config.playwright_state_file + state_filepath = Path(state_file) + state_filepath.touch(mode=0o600, exist_ok=True) # Set mode to protect sensitive cookies + storage = await context.storage_state(path=state_filepath) + await context.close() + logger.info(f"Сессия пользователя Авито сохранена в {state_file}") + except: + logger.error(f"Не удалось записать сессию в файл {state_file}") + +async def wrapper(): + async with async_playwright() as playwright: + await prompt_user_login(playwright) + +if __name__ == "__main__": + asyncio.run(wrapper()) diff --git a/requirements.txt b/requirements.txt index 728bbe0..b876d6a 100644 Binary files a/requirements.txt and b/requirements.txt differ diff --git a/state.json b/state.json new file mode 100644 index 0000000..0dfaa26 --- /dev/null +++ b/state.json @@ -0,0 +1 @@ +{"cookies": [], "origins": []} \ No newline at end of file