"""Scrape Bing and 360 Search (so.com) result pages and return filtered hits.

The public entry points are the coroutines ``search_bing_batch`` and
``search_bing``. Each result is a dict with the keys ``title``, ``url``,
``snippet`` and ``engine``.
"""

import asyncio
import logging
import re
from typing import Dict, List
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Request headers sent with every search request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
}

# Results hosted on one of these domains (or any subdomain of them) are
# discarded by ``_is_junk``.
SKIP_DOMAINS = {
    "iciba.com",
    "baike.baidu.com",
    "cambridge.org",
    "dictionary.cambridge.org",
    "collinsdictionary.com",
    "dictionary.com",
    "merriam-webster.com",
    "thesaurus.com",
    "britannica.com",
    "wikipedia.org",
    "wikihow.com",
    "facebook.com",
    "twitter.com",
    "instagram.com",
    "youtube.com",
    "reddit.com",
    "pinterest.com",
    "amazon.com",
    "ebay.com",
    "walmart.com",
    "target.com",
    "bestbuy.com",
    "homedepot.com",
    "linkedin.com",
    "bing.com",
    "google.com",
}

# Regexes applied to result titles with ``re.search``; a match discards the
# result.
SKIP_TITLE_PATTERNS = [
    r'^是什么意思$',  # "what does it mean"
    r'^翻译$',  # "translation" (whole title)
    r'^词典$',  # "dictionary"
    r'^字典$',  # "dictionary" (character dictionary)
    r'翻译$',  # title ends with "translation"
    r'^百度百科',  # title starts with "Baidu Baike"
    r'^维基百科',  # title starts with "Wikipedia" (Chinese)
]

# Hostname suffixes that are always discarded.
_SKIP_HOST_SUFFIXES = (".edu", ".ac", ".gov")

# Detects an explicit URL scheme such as "https://".
_SCHEME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9+.\-]*://")

# Detects at least one character in the CJK Unified Ideographs block.
_CJK_RE = re.compile(r'[\u4e00-\u9fff]')


def _hostname(url: str) -> str:
    """Return the lower-cased host of *url* without port/userinfo, or ""."""
    candidate = url.strip()
    # urlsplit only fills in the netloc when the URL has a scheme or starts
    # with "//", so scheme-less input such as "example.com/x" is prefixed.
    if not _SCHEME_RE.match(candidate) and not candidate.startswith("//"):
        candidate = "//" + candidate
    try:
        host = urlsplit(candidate).hostname or ""
    except ValueError:
        # urlsplit raises on malformed netlocs (e.g. broken IPv6 brackets).
        return ""
    return host.rstrip(".")


def _is_junk(item: Dict[str, str]) -> bool:
    """Decide whether a search result should be discarded.

    Args:
        item: Result dict; only the ``url`` and ``title`` keys are read and
            both default to an empty string when missing.

    Returns:
        True when the result's host is one of ``SKIP_DOMAINS`` or a subdomain
        of one, when the title matches any ``SKIP_TITLE_PATTERNS`` regex, or
        when the host ends with ``.edu``, ``.ac`` or ``.gov``.
    """
    host = _hostname(item.get("url") or "")
    title = item.get("title") or ""

    # Match on label boundaries so that e.g. "mytarget.com" is not rejected
    # because of "target.com", and a skipped domain that merely appears in
    # the path or query string does not reject an unrelated site.
    if any(host == d or host.endswith("." + d) for d in SKIP_DOMAINS):
        return True
    if any(re.search(pattern, title) for pattern in SKIP_TITLE_PATTERNS):
        return True
    return host.endswith(_SKIP_HOST_SUFFIXES)


def _collect_results(
    soup: BeautifulSoup,
    item_selector: str,
    link_selector: str,
    snippet_selector: str,
    engine: str,
    count: int,
    require_http: bool,
) -> List[Dict[str, str]]:
    """Extract up to *count* non-junk results from a parsed result page.

    Args:
        soup: Parsed HTML of the result page.
        item_selector: CSS selector matching one element per result.
        link_selector: CSS selector, relative to a result element, for the
            anchor holding the title text and ``href``.
        snippet_selector: CSS selector, relative to a result element, for the
            snippet text; a missing element yields an empty snippet.
        engine: Value stored under the ``engine`` key of every entry.
        count: Stop once this many entries have been collected.
        require_http: When True, skip links whose ``href`` does not start
            with ``"http"``.

    Returns:
        Entries in page order, de-duplicated by exact ``href``.
    """
    results: List[Dict[str, str]] = []
    seen = set()
    for item in soup.select(item_selector):
        link = item.select_one(link_selector)
        if link is None:
            continue
        href = link.get("href", "")
        if not href or href in seen:
            continue
        if require_http and not href.startswith("http"):
            continue
        seen.add(href)

        snippet_el = item.select_one(snippet_selector)
        entry = {
            "title": link.get_text(strip=True)[:120],
            "url": href,
            "snippet": snippet_el.get_text(strip=True)[:300] if snippet_el is not None else "",
            "engine": engine,
        }
        if _is_junk(entry):
            continue
        results.append(entry)
        if len(results) >= count:
            break
    return results


def _search_bing(query: str, count: int = 6) -> List[Dict[str, str]]:
    """Run a blocking Bing search and return up to *count* filtered results.

    Queries without CJK characters are sent with ``setlang=en-US`` and
    ``cc=US``. Any failure (network, HTTP status, parsing) is logged as a
    warning and produces an empty list instead of raising.
    """
    try:
        params = {"q": query, "count": count}
        if not _CJK_RE.search(query):
            params.update({"setlang": "en-US", "cc": "US"})
        resp = requests.get(
            "https://www.bing.com/search",
            params=params,
            headers=HEADERS,
            timeout=10,
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        return _collect_results(
            soup,
            item_selector="li.b_algo",
            link_selector="h2 a",
            snippet_selector=".b_caption p, .b_lineclamp2",
            engine="bing",
            count=count,
            require_http=True,
        )
    except Exception as e:
        # Deliberately broad: one failing engine must not abort the batch.
        logger.warning("Bing search failed: %s", e)
        return []


def _search_360(query: str, count: int = 6) -> List[Dict[str, str]]:
    """Run a blocking so.com search and return up to *count* filtered results.

    Any failure (network, HTTP status, parsing) is logged as a warning and
    produces an empty list instead of raising.
    """
    try:
        resp = requests.get(
            "https://www.so.com/s",
            params={"q": query},
            headers=HEADERS,
            timeout=10,
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        return _collect_results(
            soup,
            item_selector=".result-list li, .result",
            link_selector="h3 a",
            snippet_selector=".masonry-text, .res-desc",
            engine="360",
            count=count,
            require_http=False,
        )
    except Exception as e:
        # Deliberately broad: one failing engine must not abort the batch.
        logger.warning("360 search failed: %s", e)
        return []


async def search_bing_batch(queries: List[str], max_per_query: int = 6) -> List[Dict[str, str]]:
    """Search Bing and so.com for every query and merge the results.

    Queries are processed one after another; for each query the two blocking
    engine calls run concurrently in the event loop's default executor.

    Args:
        queries: Search strings, processed in order.
        max_per_query: Per-engine cap on results for each query.

    Returns:
        Entries in query order (Bing results before so.com results for each
        query), de-duplicated across all queries by URL with trailing
        slashes ignored.
    """
    loop = asyncio.get_running_loop()
    all_results: List[Dict[str, str]] = []
    seen_urls = set()
    for query in queries:
        bing_results, so_results = await asyncio.gather(
            loop.run_in_executor(None, _search_bing, query, max_per_query),
            loop.run_in_executor(None, _search_360, query, max_per_query),
        )
        for entry in bing_results + so_results:
            key = entry["url"].rstrip("/")
            if key in seen_urls:
                continue
            seen_urls.add(key)
            all_results.append(entry)
    return all_results


async def search_bing(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """Search a single query via ``search_bing_batch``.

    NOTE(review): ``max_results`` is forwarded as the per-engine cap, so the
    merged list can hold up to twice that many entries.
    """
    return await search_bing_batch([query], max_per_query=max_results)