author     sigoden <sigoden@gmail.com>    2024-06-29 07:07:33 +0800
committer  GitHub <noreply@github.com>    2024-06-29 07:07:33 +0800
commit     970ed06d2b5759e8d17aeed55fbfde591fb0ff84 (patch)
tree       49bddf7a38f408abce5c73d4445c4b0117f5feb3 /scripts
parent     59fe32ac42c7fa784e692386dd590d38ad93d4b4 (diff)
download   llm-functions-docker-970ed06d2b5759e8d17aeed55fbfde591fb0ff84.tar.gz
refactor: remove scripts/crawler.mjs (#53)
Diffstat (limited to 'scripts')
-rwxr-xr-x  scripts/crawler.mjs  215
1 file changed, 0 insertions, 215 deletions
diff --git a/scripts/crawler.mjs b/scripts/crawler.mjs
deleted file mode 100755
index 4db2e00..0000000
--- a/scripts/crawler.mjs
+++ /dev/null
@@ -1,215 +0,0 @@
-#!/usr/bin/env node
-
-/**
- * Crawl a documentation website.
- *
- * The script can be used in the following scenarios:
- * 1. Generate knowledge.json for the agent
- * > node scripts/crawler.mjs https://github.com/reactjs/react.dev/tree/main/src/content/reference tmp/knowledge.json
- * 2. To be used as a `recursive_url` document loader for AIChat
- * > recursive_url: 'node <path-to-llm-functions>/scripts/crawler.mjs $1 $2'
- */
-
-// DEPS: npm i @octokit/rest cheerio html-to-text node-fetch https-proxy-agent
-
-import { Octokit } from "@octokit/rest";
-import * as cheerio from "cheerio";
-import { URL } from "node:url";
-import { writeFileSync } from "node:fs";
-import { compile } from "html-to-text";
-import fetch from "node-fetch";
-import { HttpsProxyAgent } from "https-proxy-agent";
-
-const compiledConvert = compile({ wordwrap: false, selectors: [{ selector: 'a', options: { ignoreHref: true } }] });
-
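-// How many link levels to follow from the start URL (env override: CRAWLER_MAX_DEPTH).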
-const MAX_DEPTH = parseInt(process.env.CRAWLER_MAX_DEPTH, 10) || 3;
-
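-// How many pages to fetch in parallel per batch (env override: CRAWLER_MAX_CONCURRENT).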
-const MAX_CONCURRENT = parseInt(process.env.CRAWLER_MAX_CONCURRENT, 10) || 5;
-
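-// Paths already visited (including index.html aliases); links to them are skipped.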
-const IGNORE_LINKS = new Set();
-
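-// Links ending in any of these suffixes are never queued for crawling.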
-const IGNORE_PATHS_ENDING_IN = [
- "search.html",
- "search",
- "changelog",
- "changelog.html",
-];
-
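-// Shared fetch options; main() adds a proxy agent when HTTPS_PROXY is set.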
-let fetchOptions = {
- headers: { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" },
-};
-
-async function main() {
- const [startUrlRaw, outfile] = process.argv.slice(2);
- if (!startUrlRaw || !outfile) {
- console.log("Usage: ./crawler.mjs <url> <outfile>");
- process.exit(1);
- }
- if (startUrlRaw.startsWith("https://") && process.env["HTTPS_PROXY"]) {
- fetchOptions["agent"] = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
- }
- let pages = [];
- for await (const page of crawlPage(startUrlRaw, MAX_DEPTH)) {
- pages.push(page);
- }
- const output = JSON.stringify(pages, null, 2);
- writeFileSync(outfile, output);
-}
-
-/**
- * Crawl pages breadth-first starting from `startUrlRaw`, yielding
- * `{ path, markdown }` objects for each page fetched within `maxDepth`.
- * @param {string} startUrlRaw
- * @param {number} maxDepth
- */
-async function* crawlPage(startUrlRaw, maxDepth = 3) {
- if (!startUrlRaw.endsWith("/")) {
-    startUrlRaw += "/";
- }
- console.log("Starting crawl from: ", startUrlRaw, " - Max Depth: ", maxDepth);
- const startUrl = new URL(startUrlRaw);
- let paths = [{ path: startUrl.pathname, depth: 0 }];
-
- if (startUrl.hostname === "github.com") {
- const githubLinks = await crawlGithubRepo(startUrl);
- paths = githubLinks.map((link) => ({
- path: link,
- depth: 1,
- }));
- }
-
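-  // Breadth-first crawl: fetch the next batch of paths in parallel, yield the
-  // converted pages, and queue any newly discovered links one level deeper.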
- let index = 0;
- while (index < paths.length) {
- const batch = paths.slice(index, index + MAX_CONCURRENT);
-
- const promises = batch.map(({ path, depth }) =>
- getLinksFromUrl(startUrlRaw, path).then((links) => ({
- links,
- path,
- depth,
- })),
- );
-
- const results = await Promise.all(promises);
- for (const {
- links: { markdown, links: linksArray },
- path,
- depth,
- } of results) {
- if (markdown !== "" && depth <= maxDepth) {
- yield {
- path: new URL(path, startUrl).toString(),
- markdown,
- };
- }
-
- if (depth < maxDepth) {
- for (let link of linksArray) {
- if (!paths.some((p) => p.path === link)) {
- paths.push({ path: link, depth: depth + 1 });
- }
- }
- }
- }
-
- index += batch.length;
- }
- console.log("Crawl completed");
-}
-
-/**
- * List raw.githubusercontent.com URLs for every Markdown file under the
- * repository path that `startUrl` points to.
- * @param {URL} startUrl
- * @returns {Promise<string[]>}
- */
-async function crawlGithubRepo(startUrl) {
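-  // Unauthenticated client; requests count against GitHub's anonymous rate limit.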
- const octokit = new Octokit({
- auth: undefined,
- });
-
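-  // A tree URL has the shape /<owner>/<repo>/tree/<branch>/<path...>.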
- const [_, owner, repo, scope, branch, ...pathParts] = startUrl.pathname.split("/");
-  if (scope !== "tree" || !branch) {
-    throw new Error("Invalid GitHub URL. It must follow the format: https://github.com/<owner>/<repo>/tree/<branch>/<path>");
- }
- const rootPath = pathParts.join("/");
-
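-  // Fetch the whole repository tree in one recursive call, then keep only
-  // the Markdown files under the requested root path.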
- const tree = await octokit.request(
- "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
- {
- owner,
- repo,
- tree_sha: branch,
- headers: {
- "X-GitHub-Api-Version": "2022-11-28",
- },
- recursive: "true",
- },
- );
-
- const paths = tree.data.tree
- .filter((file) => file.type === "blob" && file.path?.endsWith(".md") && file.path.startsWith(rootPath))
- .map(
- (file) =>
- `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file.path}`,
- );
-
- return paths;
-}
-
-/**
- * Fetch `path` relative to `startUrlRaw`, convert the page to plain text,
- * and collect the same-site links it contains.
- * @param {string} startUrlRaw
- * @param {string} path
- * @returns {Promise<{markdown: string, links: string[]}>}
- */
-async function getLinksFromUrl(startUrlRaw, path) {
- const location = new URL(path, startUrlRaw).toString();
-
-  console.log(`Crawl ${location}`);
-
- const response = await fetch(location, fetchOptions);
- const html = await response.text();
-
- let links = [];
-
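-  // GitHub crawls fetch raw Markdown directly; return the body as-is without
-  // extracting further links.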
- if (startUrlRaw.includes("github.com")) {
- return {
- markdown: html,
- links,
- };
- }
-
- const $ = cheerio.load(html);
-
- IGNORE_LINKS.add(path);
- if (path.endsWith("/")) {
- IGNORE_LINKS.add(`${path}index.html`);
- }
-
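-  // Collect links that stay within the start URL, skipping fragments and
-  // ignored paths.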
- $("a").each((_, element) => {
- const href = $(element).attr("href");
- if (!href) {
- return;
- }
-
- const parsedUrl = new URL(href, startUrlRaw);
- if (parsedUrl.toString().startsWith(startUrlRaw)) {
- const link = parsedUrl.pathname;
- if (
- !IGNORE_LINKS.has(link) &&
- !link.includes("#") &&
- !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
- ) {
- links.push(link);
- }
- }
- });
-
- links = [...new Set(links)];
-
- return {
- markdown: compiledConvert(html),
- links,
- };
-}
-
-main();