Fix API errors and improve customer discovery with real web results
- Fix usage/stats 500: use Date() not datetime.date() for SQL cast
- Fix customers 422: raise size limit to 1000
- Replace unreliable MCP client with direct Bing batch search for discovery
- Batch all search queries in one browser session (faster)
- Show real company names/URLs from Bing, not generic templates
- Smart filter for non-business results (news, blogs, forums)
- Fallback suggestions when search results are insufficient
- Frontend: clickable contact URLs, provider indicator, better layout
This commit is contained in:
@@ -4,6 +4,7 @@ import logging
|
||||
import os
|
||||
import subprocess
|
||||
from typing import List, Dict
|
||||
import functools
|
||||
|
||||
from mcp.server.fastmcp import FastMCP
|
||||
|
||||
@@ -11,79 +12,95 @@ logger = logging.getLogger(__name__)
|
||||
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
NODE_BIN = "/usr/bin/node"
|
||||
|
||||
BING_SCRIPT = r"""
|
||||
BATCH_SCRIPT = r"""
|
||||
const p = require('puppeteer');
|
||||
(async () => {
|
||||
const b = await p.launch({headless:true,args:['--no-sandbox','--disable-setuid-sandbox','--disable-blink-features=AutomationControlled']});
|
||||
const page = await b.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
||||
await page.setExtraHTTPHeaders({'Accept-Language':'en-US,en;q=0.9'});
|
||||
await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', {get:()=>undefined}); });
|
||||
const q = process.argv[process.argv.length - 2];
|
||||
const max = parseInt(process.argv[process.argv.length - 1] || '10', 10);
|
||||
const sk = ['bing.com','google.com','facebook.com','twitter.com','instagram.com','youtube.com','reddit.com','amazon.com','wikipedia.org','baidu.com','linkedin.com','pinterest.com','ebay.com','walmart.com','w3.org','whatsapp.com','wechat.com','qq.com','taobao.com','tmall.com','alibaba.com','alipay.com','dict','dictionary','translate','zhihu.com','baike.baidu.com','sogou.com','163.com','sohu.com','sina.com','iciba.com','cambridge','britannica','sciencedirect','mdpi.com','springer','wiley.com','acm.org','ieee.org','researchgate','semanticscholar','ncbi.nlm.nih','nature.com','oup.com','sagepub','tandfonline'];
|
||||
const queries = JSON.parse(process.argv[process.argv.length - 2]);
|
||||
const max = parseInt(process.argv[process.argv.length - 1] || '6', 10);
|
||||
const sk = ['bing.com','google.com','facebook.com','twitter.com','instagram.com','youtube.com','reddit.com','amazon.com','walmart.com','w3.org','whatsapp.com','wechat.com','qq.com','taobao.com','tmall.com','alipay.com','zhihu.com','baike.baidu.com','sogou.com','163.com','sohu.com','sina.com','iciba.com','cambridge','britannica','sciencedirect','mdpi.com','springer','wiley.com','acm.org','ieee.org','researchgate','semanticscholar','ncbi.nlm.nih','nature.com','oup.com','sagepub','tandfonline','pinterest','ebay','dictionary','translate'];
|
||||
|
||||
try {
|
||||
await page.goto('https://cn.bing.com/search?q=' + encodeURIComponent(q) + '&setlang=en-US&cc=US', {waitUntil:'domcontentloaded',timeout:10000});
|
||||
await page.waitForSelector('.b_algo', {timeout:5000}).catch(()=>{});
|
||||
const results = await page.evaluate((m, sk) => {
|
||||
const reCJK = /[\u4e00-\u9fff\u3400-\u4dbf]/;
|
||||
const found = []; const seen = new Set();
|
||||
document.querySelectorAll('li.b_algo').forEach(li => {
|
||||
const a = li.querySelector('h2 a'); if (!a) return;
|
||||
let url = (a.href || '').replace(/\/$/,'');
|
||||
if (!url.startsWith('http') || seen.has(url)) return;
|
||||
seen.add(url);
|
||||
if (sk.some(d => url.includes(d))) return;
|
||||
const hostname = url.replace(/^https?:\/\//,'').split('/')[0];
|
||||
if (hostname.endsWith('.cn') || hostname.endsWith('.com.cn') || hostname.endsWith('.edu') || hostname.endsWith('.ac')) return;
|
||||
const title = (a.textContent||'').trim().substring(0,100);
|
||||
if (reCJK.test(title)) return;
|
||||
const s = li.querySelector('.b_caption p, .b_lineclamp2');
|
||||
found.push({title, url, snippet:s?s.textContent.trim().substring(0,200):''});
|
||||
});
|
||||
return found.slice(0,m);
|
||||
}, max, sk);
|
||||
console.log(JSON.stringify(results));
|
||||
const b = await p.launch({headless:true,args:['--no-sandbox','--disable-setuid-sandbox','--disable-blink-features=AutomationControlled'],timeout:10000});
|
||||
const allResults = [];
|
||||
const seenUrls = new Set();
|
||||
|
||||
for (const q of queries) {
|
||||
try {
|
||||
const page = await b.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
||||
await page.setExtraHTTPHeaders({'Accept-Language':'en-US,en;q=0.9'});
|
||||
|
||||
const url = 'https://www.bing.com/search?q=' + encodeURIComponent(q) + '&setlang=en-US&cc=US';
|
||||
await page.goto(url, {waitUntil:'domcontentloaded',timeout:8000});
|
||||
await page.waitForSelector('.b_algo', {timeout:4000}).catch(()=>{});
|
||||
|
||||
const results = await page.evaluate((m, sk) => {
|
||||
const found = []; const seen = new Set();
|
||||
document.querySelectorAll('li.b_algo').forEach(li => {
|
||||
const a = li.querySelector('h2 a'); if (!a) return;
|
||||
let url = (a.href || '').replace(/\/$/,'');
|
||||
if (!url.startsWith('http') || seen.has(url)) return;
|
||||
seen.add(url);
|
||||
if (sk.some(d => url.includes(d))) return;
|
||||
const hostname = url.replace(/^https?:\/\//,'').split('/')[0];
|
||||
if (hostname.endsWith('.edu') || hostname.endsWith('.ac') || hostname.endsWith('.gov')) return;
|
||||
const title = (a.textContent||'').trim().substring(0,100);
|
||||
const s = li.querySelector('.b_caption p, .b_lineclamp2');
|
||||
found.push({title, url, snippet:s?s.textContent.trim().substring(0,200):''});
|
||||
});
|
||||
return found.slice(0,m);
|
||||
}, max, sk);
|
||||
|
||||
for (const r of results) {
|
||||
if (!seenUrls.has(r.url)) {
|
||||
seenUrls.add(r.url);
|
||||
allResults.push(r);
|
||||
}
|
||||
}
|
||||
await page.close();
|
||||
} catch(e) { /* skip failed query */ }
|
||||
}
|
||||
console.log(JSON.stringify(allResults.slice(0, max * queries.length)));
|
||||
await b.close();
|
||||
} catch(e) { console.log('[]'); }
|
||||
await b.close();
|
||||
})();
|
||||
"""
|
||||
|
||||
|
||||
BING_SCRIPT_FILE = os.path.join(os.path.dirname(__file__), "_bing_search.js")
|
||||
BATCH_SCRIPT_FILE = os.path.join(os.path.dirname(__file__), "_batch_search.js")
|
||||
NODE_MODULES = os.path.join(PROJECT_ROOT, "node_modules")
|
||||
|
||||
|
||||
async def search_bing(query: str, max_results: int = 10) -> List[Dict[str, str]]:
|
||||
async def search_bing_batch(queries: List[str], max_per_query: int = 6) -> List[Dict[str, str]]:
|
||||
loop = asyncio.get_running_loop()
|
||||
try:
|
||||
with open(BING_SCRIPT_FILE, "w") as f:
|
||||
f.write(BING_SCRIPT)
|
||||
with open(BATCH_SCRIPT_FILE, "w") as f:
|
||||
f.write(BATCH_SCRIPT)
|
||||
env = os.environ.copy()
|
||||
env["NODE_PATH"] = NODE_MODULES
|
||||
result = subprocess.run(
|
||||
[NODE_BIN, BING_SCRIPT_FILE, query, str(max_results)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=15,
|
||||
cwd=PROJECT_ROOT,
|
||||
env=env,
|
||||
fn = functools.partial(
|
||||
subprocess.run,
|
||||
[NODE_BIN, BATCH_SCRIPT_FILE, json.dumps(queries), str(max_per_query)],
|
||||
capture_output=True, text=True, timeout=120, cwd=PROJECT_ROOT, env=env,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
logger.warning(f"Bing search failed: {result.stderr[:300]}")
|
||||
return []
|
||||
result = await loop.run_in_executor(None, fn)
|
||||
for line in result.stdout.strip().split("\n"):
|
||||
line = line.strip()
|
||||
if line.startswith("["):
|
||||
return json.loads(line)
|
||||
return []
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.warning("Bing search timed out")
|
||||
logger.warning("Bing batch search timed out")
|
||||
return []
|
||||
except (json.JSONDecodeError, Exception) as e:
|
||||
logger.warning(f"Bing search error: {e}")
|
||||
logger.warning(f"Bing batch search error: {e}")
|
||||
return []
|
||||
|
||||
|
||||
async def search_bing(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """Search Bing for a single query.

    Thin wrapper that delegates to ``search_bing_batch`` with a one-element
    query list, forwarding ``max_results`` as the per-query cap.

    Args:
        query: Search string to submit.
        max_results: Passed through as ``max_per_query``.

    Returns:
        Whatever ``search_bing_batch`` returns for the single query.
    """
    single_query_batch = [query]
    hits = await search_bing_batch(single_query_batch, max_per_query=max_results)
    return hits
|
||||
|
||||
|
||||
mcp = FastMCP("trade-search", log_level="WARNING")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user