From 2ee24731dc39985c5ed2014dc38edf44033af51d Mon Sep 17 00:00:00 2001 From: sigoden Date: Wed, 17 Jul 2024 09:23:52 +0800 Subject: refactor: search tools (#76) --- tools/search_arxiv.sh | 2 +- tools/search_bing.sh | 6 +++--- tools/search_brave.sh | 6 +++--- tools/search_duckduckgo.sh | 26 +++++++++++++++++++++++--- tools/search_exa.sh | 6 +++--- tools/search_google.sh | 4 ++-- tools/search_jina.sh | 4 ++-- tools/search_searxng.sh | 4 ++-- tools/search_tavily.sh | 6 +++--- 9 files changed, 42 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/search_arxiv.sh b/tools/search_arxiv.sh index 6ffa8cf..2aa7966 100755 --- a/tools/search_arxiv.sh +++ b/tools/search_arxiv.sh @@ -3,7 +3,7 @@ set -e # @describe Search arXiv for a query and return the top papers. -# @env ARXIV_MAX_RESULTS=5 The max results to return. +# @env ARXIV_MAX_RESULTS=3 The max results to return. # @option --query! The query to search for. main() { diff --git a/tools/search_bing.sh b/tools/search_bing.sh index 4ab1117..dd60e0c 100755 --- a/tools/search_bing.sh +++ b/tools/search_bing.sh @@ -5,15 +5,15 @@ set -e # Use this when you need current information or feel a search could provide a better answer. # @env BING_API_KEY! The api key -# @env BING_MAX_RESULTS=5 The max results to return. +# @env SEARCH_MAX_RESULTS=5 The max results to return. # @option --query! The query to search for. 
main() { encoded_query="$(jq -nr --arg q "$argc_query" '$q|@uri')" - url="https://api.bing.microsoft.com/v7.0/search?q=$encoded_query&mkt=en-us&textdecorations=true&textformat=raw&count=$BING_MAX_RESULTS&offset=0" + url="https://api.bing.microsoft.com/v7.0/search?q=$encoded_query&mkt=en-us&textdecorations=true&textformat=raw&count=$SEARCH_MAX_RESULTS&offset=0" curl -fsSL "$url" \ -H "Ocp-Apim-Subscription-Key: $BING_API_KEY" | \ - jq '[.webPages.value[] | {name: .name, url: .url, snippet: .snippet}]' \ + jq '[.webPages.value[] | {link: .url, title: .name, snippet: .snippet}]' \ >> "$LLM_OUTPUT" } diff --git a/tools/search_brave.sh b/tools/search_brave.sh index 9886944..f76e9ca 100755 --- a/tools/search_brave.sh +++ b/tools/search_brave.sh @@ -5,16 +5,16 @@ set -e # Use this when you need current information or feel a search could provide a better answer. # @env BRAVE_API_KEY! The api key -# @env BRAVE_MAX_RESULTS=5 The max results to return. +# @env SEARCH_MAX_RESULTS=5 The max results to return. # @option --query! The query to search for. main() { encoded_query="$(jq -nr --arg q "$argc_query" '$q|@uri')" - url="https://api.search.brave.com/res/v1/web/search?q=$encoded_query&count=$BRAVE_MAX_RESULTS" + url="https://api.search.brave.com/res/v1/web/search?q=$encoded_query&count=$SEARCH_MAX_RESULTS" curl -fsSL "$url" \ -H "Accept: application/json" \ -H "X-Subscription-Token: $BRAVE_API_KEY" | \ - jq '[.web.results[] | {title: .title, url: .url, description: .description}]' \ + jq '[.web.results[] | {link: .url, title: .title, snippet: .description}]' \ >> "$LLM_OUTPUT" } diff --git a/tools/search_duckduckgo.sh b/tools/search_duckduckgo.sh index ad57b50..062bfb5 100755 --- a/tools/search_duckduckgo.sh +++ b/tools/search_duckduckgo.sh @@ -4,12 +4,32 @@ set -e # @describe Perform a web search using DuckDuckGo API to get up-to-date information or additional context. # Use this when you need current information or feel a search could provide a better answer. 
-# @meta require-tools ddgr -# @env DDG_MAX_RESULTS=5 The max results to return. +# @env SEARCH_MAX_RESULTS=5 The max results to return. # @option --query! The query to search for. main() { - ddgr -n $DDG_MAX_RESULTS --json "$argc_query" >> "$LLM_OUTPUT" + encoded_query="$(jq -nr --arg q "$argc_query" '$q|@uri')" + vqd="$(curl -fsSL -X POST https://duckduckgo.com -d "q=$encoded_query" | sed -En 's/.*vqd=([0-9-]+)&.*/\1/p')" + url="https://links.duckduckgo.com/d.js?q=$encoded_query&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=$vqd&bing_market=wt-WT&ex=-1" + data="$(curl -fsSL "$url" | sed -En 's/.*DDG.pageLayout.load\(\x27d\x27,\[(.*)\]\);DDG.duckbar.load\(.*/\1/p')" + echo "[$data]" | jq ' +def strip_tags: + gsub("<[^>]*>"; ""); + +def unescape_html_entities: + gsub("&amp;"; "&") | + gsub("&lt;"; "<") | + gsub("&gt;"; ">") | + gsub("&quot;"; "\"") | + gsub("&apos;"; "'\''") | + gsub("&#x27;"; "'\''") | + gsub("&nbsp;"; " "); + +def normalize: strip_tags | unescape_html_entities; + +[.[:'"$SEARCH_MAX_RESULTS"'] | .[] | select(has("u")) | {link: .u, title: (.t | normalize), snippet: (.a | normalize)}] +' >> "$LLM_OUTPUT" + } eval "$(argc --argc-eval "$0" "$@")" diff --git a/tools/search_exa.sh b/tools/search_exa.sh index ccb14cf..f14d342 100755 --- a/tools/search_exa.sh +++ b/tools/search_exa.sh @@ -5,7 +5,7 @@ set -e # Use this when you need current information or feel a search could provide a better answer. # @env EXA_API_KEY! The api key -# @env EXA_MAX_RESULTS=5 The max results to return. +# @env SEARCH_MAX_RESULTS=5 The max results to return. # @option --query! The query to search for. 
main() { @@ -15,7 +15,7 @@ main() { -d ' { "query": "'"$argc_query"'", - "numResults": '"$EXA_MAX_RESULTS"', + "numResults": '"$SEARCH_MAX_RESULTS"', "type": "keyword", "contents": { "text": { @@ -23,7 +23,7 @@ main() { } } }' | \ - jq '[.results[] | {title: .title, url: .url, text: .text}]' \ + jq '[.results[] | {link: .url, title: .title, snippet: .text}]' \ >> "$LLM_OUTPUT" } diff --git a/tools/search_google.sh b/tools/search_google.sh index 94a6789..8e82f89 100755 --- a/tools/search_google.sh +++ b/tools/search_google.sh @@ -6,14 +6,14 @@ set -e # @env GOOGLE_API_KEY! The api key # @env GOOGLE_CSE_ID! The id of google search engine -# @env GOOGLE_MAX_RESULTS=5 The max results to return. +# @env SEARCH_MAX_RESULTS=5 The max results to return. # @option --query! The query to search for. main() { encoded_query="$(jq -nr --arg q "$argc_query" '$q|@uri')" url="https://www.googleapis.com/customsearch/v1?key=$GOOGLE_API_KEY&cx=$GOOGLE_CSE_ID&q=$encoded_query" curl -fsSL "$url" | \ - jq '[.items[:'"$GOOGLE_MAX_RESULTS"'] | .[] | {title: .title, link: .link, snippet: .snippet}]' \ + jq '[.items[:'"$SEARCH_MAX_RESULTS"'] | .[] | {link: .link, title: .title, snippet: .snippet}]' \ >> "$LLM_OUTPUT" } diff --git a/tools/search_jina.sh b/tools/search_jina.sh index 3e6d706..ce51a55 100755 --- a/tools/search_jina.sh +++ b/tools/search_jina.sh @@ -5,7 +5,7 @@ set -e # Use this when you need current information or feel a search could provide a better answer. # @env JINA_API_KEY The api key -# @env JINA_MAX_RESULTS=5 The max results to return. +# @env SEARCH_MAX_RESULTS=5 The max results to return. # @option --query! The query to search for. 
main() { @@ -15,7 +15,7 @@ main() { fi encoded_query="$(jq -nr --arg q "$argc_query" '$q|@uri')" curl -fsSL "${curl_args[@]}" "https://s.jina.ai/$encoded_query" | \ - jq '[.data[:'"$JINA_MAX_RESULTS"'] | .[] | {title: .title, url: .url, description: .description}]' \ + jq '[.data[:'"$SEARCH_MAX_RESULTS"'] | .[] | {link: .url, title: .title, snippet: .description}]' \ >> "$LLM_OUTPUT" } diff --git a/tools/search_searxng.sh b/tools/search_searxng.sh index e171f58..7272182 100755 --- a/tools/search_searxng.sh +++ b/tools/search_searxng.sh @@ -5,14 +5,14 @@ set -e # Use this when you need current information or feel a search could provide a better answer. # @env SEARXNG_API_BASE! The api url -# @env SEARXNG_MAX_RESULTS=5 The max results to return. +# @env SEARCH_MAX_RESULTS=5 The max results to return. # @option --query! The query to search for. main() { encoded_query="$(jq -nr --arg q "$argc_query" '$q|@uri')" url="$SEARXNG_API_BASE/search?q=$encoded_query&categories=general&language=en-US&format=json" curl -fsSL "$url" | \ - jq '[.results[:'"$SEARXNG_MAX_RESULTS"'] | .[] | {url: .url, title: .title, content: .content}]' \ + jq '[.results[:'"$SEARCH_MAX_RESULTS"'] | .[] | {link: .url, title: .title, snippet: .content}]' \ >> "$LLM_OUTPUT" } diff --git a/tools/search_tavily.sh b/tools/search_tavily.sh index 13a8520..fb1e367 100755 --- a/tools/search_tavily.sh +++ b/tools/search_tavily.sh @@ -5,7 +5,7 @@ set -e # Use this when you need current information or feel a search could provide a better answer. # @env TAVILY_API_KEY! The api key -# @env TAVILY_MAX_RESULTS=5 The max results to return. +# @env SEARCH_MAX_RESULTS=5 The max results to return. # @option --query! The query to search for. 
main() { @@ -16,9 +16,9 @@ main() { "api_key": "'"$TAVILY_API_KEY"'", "query": "'"$argc_query"'", "search_depth": "advanced", - "max_results": "'"$TAVILY_MAX_RESULTS"'" + "max_results": "'"$SEARCH_MAX_RESULTS"'" }' | \ - jq '[.results[] | {title: .title, url: .url, content: .content}]' \ + jq '[.results[] | {link: .url, title: .title, snippet: .content}]' \ >> "$LLM_OUTPUT" } -- cgit v1.2.3