""" Anti-bot protection module Implements various techniques to avoid detection by anti-crawling systems """ import random import logging from typing import Optional, Dict, Any, List import httpx from app.config import settings logger = logging.getLogger(__name__) # Predefined User-Agent list for rotation USER_AGENTS = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", ] class AntiBotProtection: """Anti-bot protection service""" def __init__(self): self.proxy_pool_url = settings.PROXY_POOL_URL self.random_delay_min = settings.RANDOM_DELAY_MIN self.random_delay_max = settings.RANDOM_DELAY_MAX def get_random_delay(self) -> float: """ Generate random delay within configured range. Returns delay in seconds. Validates: Requirements 7.1 """ delay = random.uniform(self.random_delay_min, self.random_delay_max) logger.debug(f"Generated random delay: {delay:.2f}s") return delay def get_random_user_agent(self) -> str: """ Select random User-Agent from predefined list. Returns User-Agent string. Validates: Requirements 7.2 """ user_agent = random.choice(USER_AGENTS) logger.debug(f"Selected User-Agent: {user_agent[:50]}...") return user_agent async def get_proxy(self) -> Optional[Dict[str, str]]: """ Get proxy from proxy pool service. Returns proxy dict or None if unavailable. Falls back to direct connection if proxy pool is unavailable. Validates: Requirements 7.3, 7.4 """ try: async with httpx.AsyncClient(timeout=5.0) as client: response = await client.get(f"{self.proxy_pool_url}/get") if response.status_code == 200: proxy_info = response.json() proxy_url = proxy_info.get("proxy") if proxy_url: proxy_dict = { "http://": f"http://{proxy_url}", "https://": f"https://{proxy_url}" } logger.info(f"Obtained proxy: {proxy_url}") return proxy_dict else: logger.warning("Proxy pool returned empty proxy") return None else: logger.warning(f"Proxy pool returned status {response.status_code}") return None except httpx.RequestError as e: logger.warning(f"Proxy pool service unavailable: {e}, falling back to direct connection") return None except Exception as e: logger.error(f"Error getting proxy: {e}") return None def build_headers(self, user_agent: Optional[str] = None) -> Dict[str, str]: """ Build HTTP headers with random User-Agent and common headers. Args: user_agent: Optional custom User-Agent, otherwise random one is selected Returns: Dict of HTTP headers """ if user_agent is None: user_agent = self.get_random_user_agent() headers = { "User-Agent": user_agent, "Accept": "application/json, text/plain, */*", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Referer": "https://weibo.com/", "Sec-Fetch-Dest": "empty", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Site": "same-origin", } return headers def get_fingerprint_data(self) -> Dict[str, Any]: """ Generate browser fingerprint data for simulation. Returns: Dict containing fingerprint information """ screen_resolutions = [ "1920x1080", "1366x768", "1440x900", "1536x864", "1280x720", "2560x1440", "3840x2160" ] timezones = [ "Asia/Shanghai", "Asia/Beijing", "Asia/Hong_Kong", "Asia/Taipei", "Asia/Singapore" ] languages = [ "zh-CN", "zh-CN,zh;q=0.9", "zh-CN,zh;q=0.9,en;q=0.8", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7" ] fingerprint = { "screen_resolution": random.choice(screen_resolutions), "timezone": random.choice(timezones), "language": random.choice(languages), "color_depth": random.choice([24, 32]), "platform": random.choice(["Win32", "MacIntel", "Linux x86_64"]), "hardware_concurrency": random.choice([4, 8, 12, 16]), "device_memory": random.choice([4, 8, 16, 32]), } logger.debug(f"Generated fingerprint: {fingerprint}") return fingerprint # Global instance antibot = AntiBotProtection()