"""Company web search via Google Custom Search, plus page-text extraction."""

import json
import logging
import re
from typing import List, Dict, Optional
from urllib.parse import urlsplit

import httpx

from app.config import settings
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Endpoint of the Google Custom Search JSON API.
GOOGLE_CSE_URL = "https://www.googleapis.com/customsearch/v1"

# Domains whose links are dropped from search results (search engines,
# social networks, marketplaces and reference sites).
IGNORE_DOMAINS = [
    "google.com",
    "facebook.com",
    "twitter.com",
    "instagram.com",
    "youtube.com",
    "reddit.com",
    "amazon.com",
    "ebay.com",
    "wikipedia.org",
    "linkedin.com",
    "pinterest.com",
    "baidu.com",
    "bing.com",
    "duckduckgo.com",
]


async def search_companies(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """Search the web for companies matching *query*.

    Delegates to Google Custom Search when both ``settings.GOOGLE_API_KEY``
    and ``settings.GOOGLE_CSE_ID`` are set. When either is missing, an
    informational message is logged and no results are returned.

    Args:
        query: Free-text search query.
        max_results: Upper bound on the number of results to return.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts. The list is empty
        when the query is blank, ``max_results`` is not positive, the search
        backend is not configured, or the backend yields nothing.
    """
    # A blank query has nothing to search for and a non-positive limit can
    # never yield a result, so do not spend a (quota-limited) API request.
    if not query or not query.strip() or max_results < 1:
        return []

    api_key = settings.GOOGLE_API_KEY or ""
    cse_id = settings.GOOGLE_CSE_ID or ""
    if not (api_key and cse_id):
        logger.info("Google CSE not configured, using template results")
        return []

    return await _google_cse(query, max_results, api_key, cse_id)


def _is_ignored_url(url: str) -> bool:
    """Return True when the host of *url* belongs to an ignored domain.

    Matching is done on the hostname only and on whole DNS labels, so
    ``news.google.com`` and ``www.google.com.hk`` are ignored while
    ``notgoogle.com`` or ``acme.com/?ref=twitter.com`` are not.
    """
    try:
        host = (urlsplit(url).hostname or "").lower()
    except ValueError:
        # Unparsable URL: nothing useful can be done with it downstream.
        return True
    # Surround with dots so that a plain substring test only matches on
    # label boundaries.
    padded_host = f".{host}."
    return any(f".{domain}." in padded_host for domain in IGNORE_DOMAINS)


async def _google_cse(query: str, max_results: int, api_key: str, cse_id: str) -> List[Dict[str, str]]:
    """Query the Google Custom Search JSON API and normalise the hits.

    Args:
        query: Search query sent as the ``q`` parameter.
        max_results: Upper bound on returned results; at most 10 are
            requested from the API in the single call made here.
        api_key: Google API key (``key`` parameter).
        cse_id: Custom search engine id (``cx`` parameter).

    Returns:
        A list of dicts with ``title`` (max 100 chars), ``url`` (trailing
        slashes removed) and ``snippet`` (max 200 chars). Links on ignored
        domains are skipped. Any failure (network error, non-200 status,
        malformed payload) is logged as a warning and yields ``[]``.
    """
    if max_results < 1:
        # Nothing can be returned; also avoids sending an invalid ``num``.
        return []

    # Deliberately best-effort: callers get an empty list, never an exception.
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(
                GOOGLE_CSE_URL,
                params={
                    "key": api_key,
                    "cx": cse_id,
                    "q": query,
                    "num": min(max_results, 10),
                    "lr": "lang_en",
                },
            )

        if resp.status_code != 200:
            logger.warning("Google CSE returned %s", resp.status_code)
            return []

        data = resp.json()
        items = data.get("items") if isinstance(data, dict) else None

        results: List[Dict[str, str]] = []
        for item in items or []:
            if not isinstance(item, dict):
                continue
            url = item.get("link") or ""
            if not url or _is_ignored_url(url):
                continue
            results.append({
                # ``or`` also covers keys that are present but null/empty.
                "title": (item.get("title") or url)[:100],
                "url": url.rstrip("/"),
                "snippet": (item.get("snippet") or "")[:200],
            })
        return results[:max_results]
    except Exception as e:
        logger.warning("Google CSE failed: %s", e)
        return []


async def fetch_page_text(url: str) -> Optional[str]:
    """Download *url* and return its visible text with whitespace collapsed.

    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements are
    removed before text extraction, and the result is truncated to 3000
    characters.

    Args:
        url: Absolute ``http://`` or ``https://`` URL of the page.

    Returns:
        The extracted text, or ``None`` when the URL is not http(s), the
        request fails, the status is not 200, the response declares a
        non-text content type, or fewer than 101 characters of text remain.
    """
    # Only http(s) URLs can be fetched; reject anything else up front
    # instead of relying on the HTTP client to raise.
    if not url or not url.strip().lower().startswith(("http://", "https://")):
        return None

    # Deliberately best-effort: any failure is logged and reported as None.
    try:
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})

        if resp.status_code != 200:
            return None

        # Skip PDFs, images and other binary payloads: decoding them as HTML
        # would produce garbage "text". A missing header is given the
        # benefit of the doubt.
        content_type = resp.headers.get("content-type", "").lower()
        if content_type and "html" not in content_type and not content_type.startswith("text/"):
            return None

        # Imported lazily, as in the original, so the parser is only loaded
        # when a page is actually processed.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(resp.text, "html.parser")
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        text = soup.get_text(separator=" ", strip=True)
        text = re.sub(r"\s+", " ", text)[:3000]
        return text if len(text) > 100 else None
    except Exception as e:
        logger.debug("fetch %s failed: %s", url, e)
        return None