# trade-assistant/backend/app/services/mcp_search_server.py

import asyncio
import logging
import re
from typing import List, Dict
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

# Module-level logger; the search helpers report failures through it.
logger = logging.getLogger(__name__)

# Browser-like HTTP headers sent with every outgoing search request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
}

# Registered domains whose results are always discarded. A host matches when
# it equals an entry or is a subdomain of it (see ``_is_junk``).
SKIP_DOMAINS = {
    # dictionaries / encyclopedias / reference
    "iciba.com", "baike.baidu.com", "cambridge.org", "dictionary.cambridge.org",
    "collinsdictionary.com", "dictionary.com", "merriam-webster.com",
    "thesaurus.com", "britannica.com", "wikipedia.org", "wikihow.com",
    # social networks / media
    "facebook.com", "twitter.com", "instagram.com", "youtube.com",
    "reddit.com", "pinterest.com",
    # consumer retail
    "amazon.com", "ebay.com",
    "walmart.com", "target.com", "bestbuy.com", "homedepot.com",
    # professional network and the search engines themselves
    "linkedin.com", "bing.com", "google.com",
}
# Regexes applied to result titles with ``re.search``; any hit marks the
# result as junk (translation / dictionary / encyclopedia pages).
SKIP_TITLE_PATTERNS = [
    r'^是什么意思$', r'^翻译$', r'^词典$', r'^字典$',
    r'翻译$', r'^百度百科', r'^维基百科',
]


def _split_url(url: str) -> tuple:
    """Split *url* into ``(host, rest)`` for junk filtering.

    ``host`` is the lower-cased hostname without port, credentials or a
    trailing dot. ``rest`` is the path, query and fragment. When the URL
    cannot be parsed, ``host`` is empty and ``rest`` is the whole input.
    """
    candidate = url
    # urlsplit() only recognises a host after "//"; without this prefix a
    # scheme-less "www.example.com/x" would be parsed as a bare path.
    if not re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", url) and not url.startswith("/"):
        candidate = "//" + url
    try:
        parts = urlsplit(candidate)
        host = parts.hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets).
        return "", url
    return host.rstrip("."), f"{parts.path}?{parts.query}#{parts.fragment}"


def _is_junk(item: Dict[str, str]) -> bool:
    """Return True when a search result should be dropped.

    A result is junk when any of the following holds:

    * its host is one of ``SKIP_DOMAINS`` or a subdomain of one;
    * a ``SKIP_DOMAINS`` entry appears in the path, query or fragment;
    * its title matches one of ``SKIP_TITLE_PATTERNS``;
    * its host ends in ``.edu``, ``.ac`` or ``.gov``.

    Args:
        item: Result dict; only the ``"url"`` and ``"title"`` keys are read.
            Missing or ``None`` values are treated as empty strings.

    Returns:
        True if the result should be filtered out, False otherwise.
    """
    url = item.get("url") or ""
    title = item.get("title") or ""

    host, rest = _split_url(url)
    for domain in SKIP_DOMAINS:
        # Match on a label boundary so "bing.com" does not also reject
        # unrelated hosts such as "plumbing.com".
        if host == domain or host.endswith("." + domain):
            return True
        # Kept from the previous whole-URL substring check.
        # NOTE(review): presumably meant to catch links that embed the real
        # target in their path/query - confirm this is still wanted.
        if domain in rest:
            return True

    if any(re.search(pattern, title) for pattern in SKIP_TITLE_PATTERNS):
        return True

    # NOTE(review): only bare TLDs are matched; second-level forms such as
    # ".ac.uk" or ".edu.cn" pass through - confirm whether that is intended.
    return host.endswith((".edu", ".ac", ".gov"))


def _search_bing(query: str, count: int = 6) -> List[Dict[str, str]]:
    """Fetch Bing's HTML results page for *query* and extract organic hits.

    Queries without CJK characters additionally request the en-US / US
    locale. Each hit is a dict with ``title`` (at most 120 chars), ``url``,
    ``snippet`` (at most 300 chars) and ``engine`` (``"bing"``). Hits
    rejected by ``_is_junk`` are left out, and scanning stops once ``count``
    hits have been collected.

    Any exception (network, HTTP status, parsing) is logged as a warning and
    an empty list is returned instead.
    """
    try:
        request_params = {"q": query, "count": count}
        if re.search(r'[\u4e00-\u9fff]', query) is None:
            # No CJK characters in the query: pin language and country.
            request_params["setlang"] = "en-US"
            request_params["cc"] = "US"

        response = requests.get(
            "https://www.bing.com/search",
            params=request_params,
            headers=HEADERS,
            timeout=10,
        )
        response.raise_for_status()
        document = BeautifulSoup(response.text, "html.parser")

        hits = []
        visited = set()
        for node in document.select("li.b_algo"):
            anchor = node.select_one("h2 a")
            if not anchor:
                continue
            link = anchor.get("href", "")
            # Keep only absolute http(s) links that have not been seen yet.
            if not (link.startswith("http") and link not in visited):
                continue
            visited.add(link)

            heading = anchor.get_text(strip=True)[:120]
            caption = node.select_one(".b_caption p, .b_lineclamp2")
            summary = caption.get_text(strip=True)[:300] if caption else ""
            record = {
                "title": heading,
                "url": link,
                "snippet": summary,
                "engine": "bing",
            }
            if not _is_junk(record):
                hits.append(record)
            if len(hits) >= count:
                break
        return hits
    except Exception as e:
        logger.warning(f"Bing search failed: {e}")
        return []


def _search_360(query: str, count: int = 6) -> List[Dict[str, str]]:
    """Fetch the so.com (360) HTML results page for *query* and extract hits.

    Each hit is a dict with ``title`` (at most 120 chars), ``url``,
    ``snippet`` (at most 300 chars) and ``engine`` (``"360"``). Hits rejected
    by ``_is_junk`` are left out, and scanning stops once ``count`` hits have
    been collected.

    Any exception (network, HTTP status, parsing) is logged as a warning and
    an empty list is returned instead.
    """
    try:
        response = requests.get(
            "https://www.so.com/s",
            params={"q": query},
            headers=HEADERS,
            timeout=10,
        )
        response.raise_for_status()
        document = BeautifulSoup(response.text, "html.parser")

        hits = []
        visited = set()
        for node in document.select(".result-list li, .result"):
            anchor = node.select_one("h3 a")
            if not anchor:
                continue
            link = anchor.get("href", "")
            # Skip empty links and links already collected.
            if not (link and link not in visited):
                continue
            visited.add(link)

            heading = anchor.get_text(strip=True)[:120]
            caption = node.select_one(".masonry-text, .res-desc")
            summary = caption.get_text(strip=True)[:300] if caption else ""
            record = {
                "title": heading,
                "url": link,
                "snippet": summary,
                "engine": "360",
            }
            if not _is_junk(record):
                hits.append(record)
            if len(hits) >= count:
                break
        return hits
    except Exception as e:
        logger.warning(f"360 search failed: {e}")
        return []


async def search_bing_batch(queries: List[str], max_per_query: int = 6) -> List[Dict[str, str]]:
    """Run every query against Bing and 360 and merge the de-duplicated hits.

    Queries are handled one after another. For each query the two blocking
    engine helpers run concurrently in the event loop's default executor.
    Hits are appended Bing-first, then 360, and a hit is skipped when its
    URL (ignoring trailing slashes) has already been collected.

    Args:
        queries: Search strings to run, in order.
        max_per_query: Limit passed to each engine helper for each query.

    Returns:
        The merged list of result dicts in first-seen order.
    """
    merged = []
    known_urls = set()

    for text in queries:
        loop = asyncio.get_running_loop()
        per_engine = await asyncio.gather(
            loop.run_in_executor(None, _search_bing, text, max_per_query),
            loop.run_in_executor(None, _search_360, text, max_per_query),
        )

        for engine_hits in per_engine:
            for hit in engine_hits:
                # Normalise trailing slashes so ".../page" and ".../page/"
                # count as the same result.
                key = hit["url"].rstrip("/")
                if key in known_urls:
                    continue
                known_urls.add(key)
                merged.append(hit)

    return merged


async def search_bing(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """Search a single query by delegating to ``search_bing_batch``.

    ``max_results`` is forwarded as ``max_per_query``, which the batch
    function hands to each engine separately.

    NOTE(review): because the limit is applied per engine, the merged list
    can contain more than ``max_results`` entries - confirm callers expect
    that.
    """
    single_query_batch = [query]
    hits = await search_bing_batch(single_query_batch, max_per_query=max_results)
    return hits