docs: update project docs and clean up redundant files

- PROGRESS.md: update to 2026-05-29 with security hardening (T-005), 4-frontend architecture, AI provider refactoring, discovery features, landing page/referral/quota, desktop layout, admin AI management
- AGENTS.md: add AI provider list (Alibaba/NVIDIA, removed Claude/DeepL/Local), DB-driven config, CSRF/rate-limit/CORS notes, admin_ai reload quirk
- .env.example: sync with actual config, replace deprecated providers with current Sensenova/OpencodeGo/NVIDIA/Spark/Alibaba
- docs/PROJECT_STATUS.md: archive (fully superseded by PROGRESS.md)
- Remove generated JS files (_bing_search.js, _batch_search.js)
- Remove empty directories (data/corpus, data/models)
- Remove backend/.coverage (test artifact)
- Fix services/.gitignore to cover _bing_search.js
- Include pending AI provider DB admin feature (admin_ai, AIProvider model, AIProviders.vue, migration) and T-008 test report
2026-05-29 11:15:33 +08:00
parent c04fa2c19f
commit 5d2bced39f
31 changed files with 1933 additions and 816 deletions
@@ -1,122 +1,130 @@
 import asyncio
-import json
 import logging
-import os
-import subprocess
+import re
 from typing import List, Dict
-import functools
-
-from mcp.server.fastmcp import FastMCP
+import requests
+from bs4 import BeautifulSoup

 logger = logging.getLogger(__name__)
-PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
-NODE_BIN = "/usr/bin/node"

-BATCH_SCRIPT = r"""
-const p = require('puppeteer');
-(async () => {
-  const queries = JSON.parse(process.argv[process.argv.length - 2]);
-  const max = parseInt(process.argv[process.argv.length - 1] || '6', 10);
-  const sk = ['bing.com','google.com','facebook.com','twitter.com','instagram.com','youtube.com','reddit.com','amazon.com','walmart.com','w3.org','whatsapp.com','wechat.com','qq.com','taobao.com','tmall.com','alipay.com','zhihu.com','baike.baidu.com','sogou.com','163.com','sohu.com','sina.com','iciba.com','cambridge','britannica','sciencedirect','mdpi.com','springer','wiley.com','acm.org','ieee.org','researchgate','semanticscholar','ncbi.nlm.nih','nature.com','oup.com','sagepub','tandfonline','pinterest','ebay','dictionary','translate'];
+# Browser-like request headers passed to every requests.get() call made by
+# the Bing and 360 (so.com) scrapers in this module.
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
+}

-  try {
-    const b = await p.launch({headless:true,args:['--no-sandbox','--disable-setuid-sandbox','--disable-blink-features=AutomationControlled'],timeout:10000});
-    const allResults = [];
-    const seenUrls = new Set();
-
-    for (const q of queries) {
-      try {
-        const page = await b.newPage();
-        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
-        await page.setExtraHTTPHeaders({'Accept-Language':'en-US,en;q=0.9'});
-
-        const url = 'https://www.bing.com/search?q=' + encodeURIComponent(q) + '&setlang=en-US&cc=US';
-        await page.goto(url, {waitUntil:'domcontentloaded',timeout:8000});
-        await page.waitForSelector('.b_algo', {timeout:4000}).catch(()=>{});
-
-        const results = await page.evaluate((m, sk) => {
-          const found = []; const seen = new Set();
-          document.querySelectorAll('li.b_algo').forEach(li => {
-            const a = li.querySelector('h2 a'); if (!a) return;
-            let url = (a.href || '').replace(/\/$/,'');
-            if (!url.startsWith('http') || seen.has(url)) return;
-            seen.add(url);
-            if (sk.some(d => url.includes(d))) return;
-            const hostname = url.replace(/^https?:\/\//,'').split('/')[0];
-            if (hostname.endsWith('.edu') || hostname.endsWith('.ac') || hostname.endsWith('.gov')) return;
-            const title = (a.textContent||'').trim().substring(0,100);
-            const s = li.querySelector('.b_caption p, .b_lineclamp2');
-            found.push({title, url, snippet:s?s.textContent.trim().substring(0,200):''});
-          });
-          return found.slice(0,m);
-        }, max, sk);
-
-        for (const r of results) {
-          if (!seenUrls.has(r.url)) {
-            seenUrls.add(r.url);
-            allResults.push(r);
-          }
-        }
-        await page.close();
-      } catch(e) { /* skip failed query */ }
-    }
-    console.log(JSON.stringify(allResults.slice(0, max * queries.length)));
-    await b.close();
-  } catch(e) { console.log('[]'); }
-})();
-"""
+# Domains that _is_junk() treats as junk results: dictionaries and
+# encyclopedias, social networks, marketplaces/retailers and the search
+# engines themselves.
+SKIP_DOMAINS = {
+    "iciba.com", "baike.baidu.com", "cambridge.org", "dictionary.cambridge.org",
+    "collinsdictionary.com", "dictionary.com", "merriam-webster.com",
+    "thesaurus.com", "britannica.com", "wikipedia.org", "wikihow.com",
+    "facebook.com", "twitter.com", "instagram.com", "youtube.com",
+    "reddit.com", "pinterest.com", "amazon.com", "ebay.com",
+    "walmart.com", "target.com", "bestbuy.com", "homedepot.com",
+    "linkedin.com", "bing.com", "google.com",
+}
+# Regexes that _is_junk() runs against a result title with re.search(); any
+# match marks the result as junk. Glosses: "what does it mean", "translation",
+# "dictionary" (two spellings), titles ending in "translation", and titles
+# starting with "Baidu Baike" or "Wikipedia".
+# NOTE(review): r'^翻译$' is already covered by r'翻译$'.
+SKIP_TITLE_PATTERNS = [
+    r'^是什么意思$', r'^翻译$', r'^词典$', r'^字典$',
+    r'翻译$', r'^百度百科', r'^维基百科',
+]


-BATCH_SCRIPT_FILE = os.path.join(os.path.dirname(__file__), "_batch_search.js")
-NODE_MODULES = os.path.join(PROJECT_ROOT, "node_modules")
+def _is_junk(item: Dict[str, str]) -> bool:
+    """Return True when a search result should be discarded.
+
+    A result is junk when its host is a SKIP_DOMAINS entry (or a subdomain or
+    country variant of one), when its title matches a SKIP_TITLE_PATTERNS
+    regex, or when its host ends in ``.edu``, ``.ac`` or ``.gov``.
+
+    Args:
+        item: Result dict; only the ``url`` and ``title`` keys are read and
+            both default to "" when missing.
+
+    Returns:
+        True if the result should be dropped, False otherwise.
+    """
+    # Function-scope import so this block does not depend on the file-level
+    # import list.
+    from urllib.parse import urlsplit
+
+    url = item.get("url", "")
+    title = item.get("title", "")
+    try:
+        # Scheme-less input ("example.com/x") needs a "//" prefix so that
+        # urlsplit() treats the leading part as the network location.
+        parts = urlsplit(url if "://" in url else f"//{url}")
+        # .hostname is lower-cased and excludes userinfo and port.
+        hostname = (parts.hostname or "").rstrip(".")
+    except ValueError:
+        # Malformed authority, e.g. unbalanced IPv6 brackets.
+        hostname = ""
+
+    # Compare on label boundaries: "bing.com" must match "www.bing.com" and
+    # "bing.com.cn" but not "climbing.com". Only the host is inspected, so a
+    # skip domain that merely appears in a path or query string no longer
+    # discards an unrelated site.
+    dotted_host = f".{hostname}."
+    if any(f".{domain}." in dotted_host for domain in SKIP_DOMAINS):
+        return True
+    if any(re.search(pattern, title) for pattern in SKIP_TITLE_PATTERNS):
+        return True
+    return hostname.endswith((".edu", ".ac", ".gov"))
+
+
+def _search_bing(query: str, count: int = 6) -> List[Dict[str, str]]:
+    """Scrape one Bing results page and return the non-junk organic hits.
+
+    This is a blocking call (it uses ``requests``). Queries without any
+    character in U+4E00-U+9FFF additionally pin the locale with
+    ``setlang=en-US`` and ``cc=US``.
+
+    Args:
+        query: Search text sent as the ``q`` parameter.
+        count: Sent to Bing as ``count`` and used as the stop threshold for
+            collected results.
+
+    Returns:
+        Dicts with ``title``, ``url``, ``snippet`` and ``engine`` ("bing").
+        Any exception is logged as a warning and yields an empty list.
+    """
+    try:
+        params = {"q": query, "count": count}
+        if re.search(r'[\u4e00-\u9fff]', query) is None:
+            params["setlang"] = "en-US"
+            params["cc"] = "US"
+        response = requests.get(
+            "https://www.bing.com/search", params=params, headers=HEADERS, timeout=10
+        )
+        response.raise_for_status()
+        page = BeautifulSoup(response.text, "html.parser")
+
+        hits = []
+        visited = set()
+        for block in page.select("li.b_algo"):
+            link = block.select_one("h2 a")
+            if not link:
+                continue
+            target = link.get("href", "")
+            if not target.startswith("http") or target in visited:
+                continue
+            visited.add(target)
+            heading = link.get_text(strip=True)[:120]
+            caption = block.select_one(".b_caption p, .b_lineclamp2")
+            hit = {
+                "title": heading,
+                "url": target,
+                "snippet": caption.get_text(strip=True)[:300] if caption else "",
+                "engine": "bing",
+            }
+            if not _is_junk(hit):
+                hits.append(hit)
+            # Checked after every candidate, junk or not, as in the original.
+            if len(hits) >= count:
+                break
+        return hits
+    except Exception as exc:
+        logger.warning(f"Bing search failed: {exc}")
+        return []
+
+
+def _search_360(query: str, count: int = 6) -> List[Dict[str, str]]:
+    """Scrape one 360 Search (so.com) results page and return non-junk hits.
+
+    This is a blocking call (it uses ``requests``).
+
+    Args:
+        query: Search text sent as the ``q`` parameter.
+        count: Stop threshold for collected results.
+
+    Returns:
+        Dicts with ``title``, ``url``, ``snippet`` and ``engine`` ("360").
+        Any exception is logged as a warning and yields an empty list.
+    """
+    try:
+        resp = requests.get("https://www.so.com/s", params={"q": query}, headers=HEADERS, timeout=10)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.text, "html.parser")
+        results = []
+        seen = set()
+        for li in soup.select(".result-list li, .result"):
+            a = li.select_one("h3 a")
+            if not a:
+                continue
+            # NOTE(review): so.com anchors appear to carry a redirect link in
+            # href and the real destination in data-mdurl; confirm against
+            # live markup. Preferring data-mdurl lets _is_junk() and URL
+            # de-duplication see the real site; href remains the fallback.
+            href = a.get("data-mdurl") or a.get("href", "")
+            # Same rule as the Bing scraper: only absolute http(s) links.
+            if not href.startswith("http") or href in seen:
+                continue
+            seen.add(href)
+            title = a.get_text(strip=True)[:120]
+            snippet_el = li.select_one(".masonry-text, .res-desc")
+            snippet = snippet_el.get_text(strip=True)[:300] if snippet_el else ""
+            entry = {"title": title, "url": href, "snippet": snippet, "engine": "360"}
+            if not _is_junk(entry):
+                results.append(entry)
+            if len(results) >= count:
+                break
+        return results
+    except Exception as e:
+        # Deliberate best-effort: a failing engine must not break the batch.
+        logger.warning("360 search failed: %s", e)
+        return []


 async def search_bing_batch(queries: List[str], max_per_query: int = 6) -> List[Dict[str, str]]:
-    loop = asyncio.get_running_loop()
-    try:
-        with open(BATCH_SCRIPT_FILE, "w") as f:
-            f.write(BATCH_SCRIPT)
-        env = os.environ.copy()
-        env["NODE_PATH"] = NODE_MODULES
-        fn = functools.partial(
-            subprocess.run,
-            [NODE_BIN, BATCH_SCRIPT_FILE, json.dumps(queries), str(max_per_query)],
-            capture_output=True, text=True, timeout=120, cwd=PROJECT_ROOT, env=env,
-        )
-        result = await loop.run_in_executor(None, fn)
-        for line in result.stdout.strip().split("\n"):
-            line = line.strip()
-            if line.startswith("["):
-                return json.loads(line)
-        return []
-    except subprocess.TimeoutExpired:
-        logger.warning("Bing batch search timed out")
-        return []
-    except (json.JSONDecodeError, Exception) as e:
-        logger.warning(f"Bing batch search error: {e}")
-        return []
+    """Run every query against Bing and 360 Search and merge the hits.
+
+    Queries are processed one after another; for each query the two blocking
+    scrapers run concurrently in the default executor. Results keep their
+    arrival order (Bing first, then 360) and are de-duplicated across all
+    queries by URL with trailing slashes ignored.
+
+    Args:
+        queries: Search strings to run.
+        max_per_query: Passed to each scraper as its ``count``; it limits
+            each engine separately, not the merged list.
+
+    Returns:
+        The merged list of result dicts; empty when ``queries`` is empty.
+    """
+    loop = asyncio.get_running_loop()
+    merged = []
+    seen_urls = set()
+
+    for query in queries:
+        bing_results, so_results = await asyncio.gather(
+            loop.run_in_executor(None, _search_bing, query, max_per_query),
+            loop.run_in_executor(None, _search_360, query, max_per_query),
+        )
+        for entry in bing_results + so_results:
+            key = entry["url"].rstrip("/")
+            if key in seen_urls:
+                continue
+            seen_urls.add(key)
+            merged.append(entry)
+
+    return merged


 async def search_bing(query: str, max_results: int = 10) -> List[Dict[str, str]]:
+    """Search a single query by delegating to search_bing_batch().
+
+    ``max_results`` is forwarded as the batch's ``max_per_query``. The batch
+    helper passes that value to each engine separately, so the merged list
+    can hold more than ``max_results`` entries.
+    """
     return await search_bing_batch([query], max_per_query=max_results)
-
-
-mcp = FastMCP("trade-search", log_level="WARNING")
-
-
-@mcp.tool(
-    name="web_search",
-    description="Search the web for companies, buyers, or business information. Returns title, URL, and snippet for each result. Useful for finding potential customers, researching companies, or gathering market intelligence.",
-)
-async def web_search(query: str, max_results: int = 10) -> str:
-    results = await search_bing(query, max_results)
-    if not results:
-        return json.dumps({"results": [], "error": None})
-    return json.dumps({"results": results, "error": None})
-
-
-def main():
-    asyncio.run(mcp.run_stdio_async())
-
-if __name__ == "__main__":
-    main()