#!/usr/bin/env node

/**
 * Crawl a documentation website.
 *
 * The script can be used in the following scenarios:
 * 1. Generate knowledge.json for the agent:
 *    > node scripts/crawler.mjs https://github.com/reactjs/react.dev/tree/main/src/content/reference tmp/knowledge.json
 * 2. Act as a `recursive_url` document loader for AIChat:
 *    > recursive_url: 'node /scripts/crawler.mjs $1 $2'
 */

// DEPS: npm i @octokit/rest cheerio html-to-text node-fetch https-proxy-agent

import { Octokit } from "@octokit/rest";
import * as cheerio from "cheerio";
import { URL } from "node:url";
import { writeFileSync } from "node:fs";
import { compile } from "html-to-text";
import fetch from "node-fetch";
import { HttpsProxyAgent } from "https-proxy-agent";

// Reusable HTML-to-text converter; link targets are dropped to keep the output clean.
const compiledConvert = compile({
  wordwrap: false,
  selectors: [{ selector: 'a', options: { ignoreHref: true } }],
});

const MAX_DEPTH = parseInt(process.env.CRAWLER_MAX_DEPTH) || 3;
const MAX_CONCURRENT = parseInt(process.env.CRAWLER_MAX_CONCURRENT) || 5;

// Paths already visited (or known to be uninteresting) are skipped.
const IGNORE_LINKS = new Set();
const IGNORE_PATHS_ENDING_IN = [
  "search.html",
  "search",
  "changelog",
  "changelog.html",
];

let fetchOptions = {
  headers: {
    "user-agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
  },
};

async function main() {
  const [startUrlRaw, outfile] = process.argv.slice(2);
  if (!startUrlRaw || !outfile) {
    console.log("Usage: ./crawler.mjs <startUrl> <outfile>");
    process.exit(1);
  }

  if (startUrlRaw.startsWith("https://") && process.env["HTTPS_PROXY"]) {
    fetchOptions["agent"] = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
  }

  let pages = [];
  for await (const page of crawlPage(startUrlRaw, MAX_DEPTH)) {
    pages.push(page);
  }

  const output = JSON.stringify(pages, null, 2);
  writeFileSync(outfile, output);
}

/**
 * Crawl pages starting from `startUrlRaw`, yielding `{ path, markdown }` objects.
 *
 * @param {String} startUrlRaw
 * @param {number} maxDepth
 */
async function* crawlPage(startUrlRaw, maxDepth = 3) {
  if (!startUrlRaw.endsWith("/")) {
    startUrlRaw += "/";
  }

  console.log("Starting crawl from: ", startUrlRaw, " - Max Depth: ", maxDepth);

  const startUrl = new URL(startUrlRaw);
  let paths = [{ path: startUrl.pathname, depth: 0 }];

  // GitHub repositories are listed through the API rather than by following HTML links.
  if (startUrl.hostname === "github.com") {
    const githubLinks = await crawlGithubRepo(startUrl);
    paths = githubLinks.map((link) => ({
      path: link,
      depth: 1,
    }));
  }

  let index = 0;
  while (index < paths.length) {
    // Fetch pages in batches of MAX_CONCURRENT to bound concurrency.
    const batch = paths.slice(index, index + MAX_CONCURRENT);

    const promises = batch.map(({ path, depth }) =>
      getLinksFromUrl(startUrlRaw, path).then((links) => ({
        links,
        path,
        depth,
      })),
    );

    const results = await Promise.all(promises);
    for (const {
      links: { markdown, links: linksArray },
      path,
      depth,
    } of results) {
      if (markdown !== "" && depth <= maxDepth) {
        yield {
          path: new URL(path, startUrl).toString(),
          markdown,
        };
      }
      // Newly discovered links are queued one level deeper, skipping duplicates.
      if (depth < maxDepth) {
        for (let link of linksArray) {
          if (!paths.some((p) => p.path === link)) {
            paths.push({ path: link, depth: depth + 1 });
          }
        }
      }
    }

    index += batch.length;
  }

  console.log("Crawl completed");
}

/**
 * List all markdown files under a GitHub tree URL via the git trees API.
 *
 * @param {URL} startUrl
 * @returns {Promise<String[]>} raw.githubusercontent.com URLs of the markdown files
 */
async function crawlGithubRepo(startUrl) {
  const octokit = new Octokit({
    auth: undefined,
  });

  const [_, owner, repo, scope, branch, ...pathParts] =
    startUrl.pathname.split("/");
  if (scope !== "tree" || !branch) {
    throw new Error(
      "Invalid Github URL. It must follow the format: https://github.com/<owner>/<repo>/tree/<branch>/<path>",
    );
  }
  const rootPath = pathParts.join("/");

  const tree = await octokit.request(
    "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
    {
      owner,
      repo,
      tree_sha: branch,
      headers: {
        "X-GitHub-Api-Version": "2022-11-28",
      },
      recursive: "true",
    },
  );

  const paths = tree.data.tree
    .filter(
      (file) =>
        file.type === "blob" &&
        file.path?.endsWith(".md") &&
        file.path.startsWith(rootPath),
    )
    .map(
      (file) =>
        `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file.path}`,
    );

  return paths;
}

/**
 * Fetch a page and extract its markdown plus the in-site links it contains.
 *
 * @param {String} startUrlRaw
 * @param {String} path
 * @returns {Promise<{markdown: String, links: String[]}>}
 */
async function getLinksFromUrl(startUrlRaw, path) {
  const location = new URL(path, startUrlRaw).toString();
  console.log(`Crawl ${location}`);

  const response = await fetch(location, fetchOptions);
  const html = await response.text();

  let links = [];

  // Raw markdown files fetched from GitHub need no conversion or link extraction.
  if (startUrlRaw.includes("github.com")) {
    return {
      markdown: html,
      links,
    };
  }

  const $ = cheerio.load(html);

  IGNORE_LINKS.add(path);
  if (path.endsWith("/")) {
    IGNORE_LINKS.add(`${path}index.html`);
  }

  // Collect same-site links, excluding anchors and ignored paths.
  $("a").each((_, element) => {
    const href = $(element).attr("href");
    if (!href) {
      return;
    }

    const parsedUrl = new URL(href, startUrlRaw);
    if (parsedUrl.toString().startsWith(startUrlRaw)) {
      const link = parsedUrl.pathname;
      if (
        !IGNORE_LINKS.has(link) &&
        !link.includes("#") &&
        !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
      ) {
        links.push(link);
      }
    }
  });

  links = [...new Set(links)];

  return {
    markdown: compiledConvert(html),
    links,
  };
}

main();
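
// Illustrative output shape: the outfile holds the JSON array produced by
// JSON.stringify(pages) above, i.e. one { path, markdown } object per crawled page.
// The entry below is a hypothetical example for the react.dev invocation in the header,
// not an actual sample of generated data.
//
// [
//   {
//     "path": "https://raw.githubusercontent.com/reactjs/react.dev/main/src/content/reference/react/useState.md",
//     "markdown": "..."
//   }
// ]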