fix: prevent crashes on invalid paths and silent CDX JSON parse errors

This commit is contained in:
User
2025-09-29 10:27:38 +02:00
parent f840c4a8f1
commit a6f5ee5e1c
3 changed files with 95 additions and 110 deletions

View File

@@ -50,8 +50,12 @@ This webarchive website downloader has an interactive interface, supports downlo
```bash ```bash
git clone https://github.com/birbwatcher/wayback-machine-downloader.git git clone https://github.com/birbwatcher/wayback-machine-downloader.git
```
Go to the inner folder "wayback-machine-downloader":
```bash
cd wayback-machine-downloader cd wayback-machine-downloader
```
```bash
# Install dependencies # Install dependencies
npm install npm install
``` ```

View File

@@ -1,5 +1,5 @@
/* /*
* Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM) * Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM)
* Run: node downloader.js * Run: node downloader.js
*/ */
@@ -21,9 +21,7 @@ function renderProgress(current, total) {
const ratio = total > 0 ? current / total : 0; const ratio = total > 0 ? current / total : 0;
const filled = Math.round(ratio * width); const filled = Math.round(ratio * width);
const bar = "█".repeat(filled) + "-".repeat(width - filled); const bar = "█".repeat(filled) + "-".repeat(width - filled);
process.stdout.write( process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`);
`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
);
if (current === total) process.stdout.write("\n"); if (current === total) process.stdout.write("\n");
} }
@@ -47,17 +45,12 @@ function isHtmlFile(filePath, contentType, firstBytes) {
const ext = path.extname(filePath).toLowerCase(); const ext = path.extname(filePath).toLowerCase();
if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true; if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
const head = (firstBytes || "").toString("utf8", 0, 512); const head = (firstBytes || "").toString("utf8", 0, 512);
return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head); return /<!doctype html/i.test(head) || /<html[\\s>]/i.test(head);
} }
// ----------------------------- Archive API ----------------------------- // ----------------------------- Archive API -----------------------------
async function getRawListFromApi({ async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
baseUrl,
pageIndex,
all,
fromTimestamp,
toTimestamp,
}) {
const cdx = new URL("https://web.archive.org/cdx/search/xd"); const cdx = new URL("https://web.archive.org/cdx/search/xd");
const params = new URLSearchParams(); const params = new URLSearchParams();
params.set("output", "json"); params.set("output", "json");
@@ -66,43 +59,41 @@ async function getRawListFromApi({
params.set("collapse", "digest"); params.set("collapse", "digest");
params.set("gzip", "false"); params.set("gzip", "false");
if (!all) params.append("filter", "statuscode:200"); if (!all) params.append("filter", "statuscode:200");
if (fromTimestamp && Number(fromTimestamp) !== 0) if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
params.set("from", String(fromTimestamp)); if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
if (toTimestamp && Number(toTimestamp) !== 0)
params.set("to", String(toTimestamp));
if (pageIndex != null) params.set("page", String(pageIndex)); if (pageIndex != null) params.set("page", String(pageIndex));
cdx.search = params.toString(); cdx.search = params.toString();
try { try {
const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" }); const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
const text = await res.text(); const text = await res.text();
const json = JSON.parse(text); let json = [];
if ( try {
Array.isArray(json) && json = JSON.parse(text);
Array.isArray(json[0]) && } catch {
json[0].join(",") === "timestamp,original" // silent: treat as empty page
) { return [];
}
if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
json.shift(); json.shift();
} }
return json || []; return json || [];
} catch (e) { } catch {
console.log(`ERROR getRawListFromApi: ${e}`); // silent: skip broken page
return []; return [];
} }
} }
// ----------------------------- DOWNLOADER CLASS ----------------------------- // ----------------------------- DOWNLOADER CLASS -----------------------------
class WaybackMachineDownloader { class WaybackMachineDownloader {
constructor(params) { constructor(params) {
this.base_url = params.base_url; this.base_url = params.base_url;
this.exact_url = !!params.exact_url; this.exact_url = !!params.exact_url;
this.directory = params.directory || null; this.directory = params.directory || null;
this.from_timestamp = params.from_timestamp this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
? Number(params.from_timestamp)
: 0;
this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0; this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
this.threads_count = this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;
params.threads_count != null ? Number(params.threads_count) : 3;
this.download_external_assets = params.download_external_assets || false; this.download_external_assets = params.download_external_assets || false;
@@ -113,49 +104,38 @@ class WaybackMachineDownloader {
this._processed = 0; this._processed = 0;
} }
// Create a human-readable backup folder name, preserving IDNs
backup_name() { backup_name() {
try { try {
if (this.base_url.includes("//")) { if (this.base_url.includes("//")) {
const u = new URL(this.base_url); const u = new URL(this.base_url);
return domainToUnicode(u.host); // use human-readable domain return domainToUnicode(u.host);
} }
} catch {} } catch {}
return this.base_url; return this.base_url;
} }
// Resolve output directory
backup_path() { backup_path() {
if (this.directory) { if (this.directory) {
return this.directory.endsWith(path.sep) return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
? this.directory
: this.directory + path.sep;
} }
return path.join("websites", this.backup_name(), path.sep); return path.join("websites", this.backup_name(), path.sep);
} }
// Fetch and merge snapshot lists
async get_all_snapshots_to_consider() { async get_all_snapshots_to_consider() {
console.log("Getting snapshot pages"); console.log("Getting snapshot pages");
const httpOpts = { const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
all: true,
fromTimestamp: this.from_timestamp,
toTimestamp: this.to_timestamp,
};
let list = []; let list = [];
list = list.concat( list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }));
await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
);
process.stdout.write("."); process.stdout.write(".");
if (!this.exact_url) { if (!this.exact_url) {
const wildcard = this.base_url.endsWith("/*") const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*";
? this.base_url
: this.base_url.replace(/\/*$/, "") + "/*";
for (let i = 0; i < 100; i++) { for (let i = 0; i < 100; i++) {
const batch = await getRawListFromApi({ const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
baseUrl: wildcard,
pageIndex: i,
...httpOpts,
});
if (!batch || batch.length === 0) break; if (!batch || batch.length === 0) break;
list = list.concat(batch); list = list.concat(batch);
process.stdout.write("."); process.stdout.write(".");
@@ -165,12 +145,15 @@ class WaybackMachineDownloader {
return list; return list;
} }
// Choose the latest timestamp per unique pathname
async get_file_list_by_timestamp() { async get_file_list_by_timestamp() {
const curated = new Map(); const curated = new Map();
const all = await this.get_all_snapshots_to_consider(); const all = await this.get_all_snapshots_to_consider();
for (const pair of all) { for (const pair of all) {
const ts = pair[0]; const ts = pair && pair[0];
const url = pair[1]; const url = pair && pair[1];
if (!ts || !url) continue;
try { try {
const u = new URL(url); const u = new URL(url);
const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
@@ -185,13 +168,13 @@ class WaybackMachineDownloader {
return arr; return arr;
} }
// Replace Windows-hostile characters when running on Windows
_windowsSanitize(p) { _windowsSanitize(p) {
if (process.platform !== "win32") return p; if (process.platform !== "win32") return p;
return p.replace(/[:*?&=<>\\|]/g, (s) => return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
"%" + s.charCodeAt(0).toString(16)
);
} }
// Ensure directory exists
async _structure_dir_path(dir_path) { async _structure_dir_path(dir_path) {
try { try {
await mkdir(dir_path, { recursive: true }); await mkdir(dir_path, { recursive: true });
@@ -200,9 +183,10 @@ class WaybackMachineDownloader {
} }
} }
// Compute local file paths for a given archived URL
_determine_paths(file_url, file_id) { _determine_paths(file_url, file_id) {
if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) if (!file_url || !file_id) return null;
return null; if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null;
if (file_id.length > 200) return null; if (file_id.length > 200) return null;
const backup = this.backup_path(); const backup = this.backup_path();
@@ -212,15 +196,15 @@ class WaybackMachineDownloader {
if (file_id === "") { if (file_id === "") {
dir_path = backup; dir_path = backup;
file_path = path.join(backup, "index.html"); file_path = path.join(backup, "index.html");
} else if (
file_url.endsWith("/") ||
!parts[parts.length - 1].includes(".")
) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else { } else {
dir_path = path.join(backup, ...parts.slice(0, -1)); const lastPart = parts[parts.length - 1] || "";
file_path = path.join(backup, ...parts); if (file_url.endsWith("/") || !lastPart.includes(".")) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else {
dir_path = path.join(backup, ...parts.slice(0, -1));
file_path = path.join(backup, ...parts);
}
} }
dir_path = this._windowsSanitize(dir_path); dir_path = this._windowsSanitize(dir_path);
@@ -229,6 +213,8 @@ class WaybackMachineDownloader {
return { dir_path, file_path }; return { dir_path, file_path };
} }
// Download a single asset (img/css/js/etc.) referenced from an HTML page
async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) { async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
try { try {
if (fs.existsSync(file_path)) return file_path; if (fs.existsSync(file_path)) return file_path;
@@ -261,21 +247,20 @@ class WaybackMachineDownloader {
} }
} }
// Parse saved HTML, optionally rewrite internal links to relative and fetch assets
async _process_html_assets(htmlPath, pageUrl, pageTimestamp) { async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
try { try {
const backupRoot = this.backup_path(); const backupRoot = this.backup_path();
let html = fs.readFileSync(htmlPath, "utf8"); let html = fs.readFileSync(htmlPath, "utf8");
const $ = load(html); const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is
const site = new URL(this.base_url); const site = new URL(this.base_url);
const siteHost = domainToUnicode(site.hostname.replace(/^www\./, "")); const siteHost = domainToUnicode(site.hostname.replace(/^www\\./, ""));
const baseDir = path.dirname(htmlPath); const baseDir = path.dirname(htmlPath);
const downloadTasks = []; const downloadTasks = [];
// ----------- ASSETS ----------- // ----------- ASSETS -----------
$( $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
"img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
).each((_, el) => {
const attr = el.tagName === "link" ? "href" : "src"; const attr = el.tagName === "link" ? "href" : "src";
const val = $(el).attr(attr); const val = $(el).attr(attr);
if (!val) return; if (!val) return;
@@ -283,12 +268,17 @@ class WaybackMachineDownloader {
try { try {
const abs = new URL(val, pageUrl).toString(); const abs = new URL(val, pageUrl).toString();
const u = new URL(abs); const u = new URL(abs);
const isInternal = const isInternal = domainToUnicode(u.hostname.replace(/^www\\./, "")) === siteHost;
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
if (isInternal || this.download_external_assets) { if (isInternal || this.download_external_assets) {
const file_id = decodeURIComponent(u.pathname); const file_id = decodeURIComponent(u.pathname);
const paths = this._determine_paths(abs, file_id); let paths;
try {
paths = this._determine_paths(abs, file_id);
} catch (e) {
console.log(`Invalid path for asset ${abs}: ${e}`);
return;
}
if (!paths) return; if (!paths) return;
const { dir_path, file_path } = paths; const { dir_path, file_path } = paths;
@@ -300,9 +290,7 @@ class WaybackMachineDownloader {
} }
if (!fs.existsSync(file_path)) { if (!fs.existsSync(file_path)) {
downloadTasks.push( downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path));
this._download_asset(abs, pageTimestamp, file_path, dir_path)
);
} }
} }
} catch {} } catch {}
@@ -318,8 +306,7 @@ class WaybackMachineDownloader {
try { try {
const abs = new URL(val, pageUrl).toString(); const abs = new URL(val, pageUrl).toString();
const u = new URL(abs); const u = new URL(abs);
const isInternal = const isInternal = domainToUnicode(u.hostname.replace(/^www\\./, "")) === siteHost;
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
if (isInternal) { if (isInternal) {
const normPath = decodeURIComponent(u.pathname) + (u.hash || ""); const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
@@ -343,17 +330,30 @@ class WaybackMachineDownloader {
} }
} }
// Download one file from the snapshot list (page or asset saved by CDX)
async _download_single(file_remote_info, total) { async _download_single(file_remote_info, total) {
const file_url = String(file_remote_info.file_url); const file_url = String(file_remote_info.file_url);
const file_id = file_remote_info.file_id; const file_id = file_remote_info.file_id;
const file_timestamp = file_remote_info.timestamp; const file_timestamp = file_remote_info.timestamp;
const paths = this._determine_paths(file_url, file_id);
let paths;
try {
paths = this._determine_paths(file_url, file_id);
} catch (e) {
console.log(`Invalid path for ${file_url}: ${e}`);
this._processed++;
renderProgress(this._processed, total);
return;
}
if (!paths) { if (!paths) {
console.log(`Skipping invalid URL: ${file_url}`); console.log(`Skipping invalid URL: ${file_url}`);
this._processed++; this._processed++;
renderProgress(this._processed, total); renderProgress(this._processed, total);
return; return;
} }
const { dir_path, file_path } = paths; const { dir_path, file_path } = paths;
if (fs.existsSync(file_path)) { if (fs.existsSync(file_path)) {
@@ -387,11 +387,7 @@ class WaybackMachineDownloader {
const contentType = res.headers.get("content-type"); const contentType = res.headers.get("content-type");
const ext = path.extname(file_path).toLowerCase(); const ext = path.extname(file_path).toLowerCase();
const looksHtml = const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
isHtmlFile(file_path, contentType, null) ||
ext === "" ||
ext === ".html" ||
ext === ".htm";
if (looksHtml) { if (looksHtml) {
await this._process_html_assets(file_path, file_url, file_timestamp); await this._process_html_assets(file_path, file_url, file_timestamp);
} }
@@ -403,43 +399,33 @@ class WaybackMachineDownloader {
} }
} }
// Orchestrate downloads with concurrency
async download_files() { async download_files() {
const startTime = Date.now(); const startTime = Date.now();
console.log( console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
);
const list = await this.get_file_list_by_timestamp(); const list = await this.get_file_list_by_timestamp();
if (list.length === 0) { if (list.length === 0) {
console.log("No files to download."); console.log("No files to download.");
return; return;
} }
const concurrency = const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
const limit = pLimit(concurrency); const limit = pLimit(concurrency);
this._processed = 0; this._processed = 0;
await Promise.all( await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
list.map((info) => limit(() => this._download_single(info, list.length)))
);
const endTime = Date.now(); const endTime = Date.now();
console.log( console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
2
)}s, saved in ${this.backup_path()} (${list.length} files)`
);
} }
} }
// ============================= INTERACTIVE RUN ============================= // ============================= INTERACTIVE RUN =============================
function ask(rl, question) { function ask(rl, question) {
return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim()))); return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
} }
async function interactiveMain() { async function interactiveMain() {
const rl = readline.createInterface({ const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
input: process.stdin,
output: process.stdout,
});
let base_url; let base_url;
while (true) { while (true) {
@@ -463,20 +449,15 @@ async function interactiveMain() {
let canonical_action = "keep"; let canonical_action = "keep";
if (rewrite_mode === "relative") { if (rewrite_mode === "relative") {
const c = await ask(rl, 'Canonical: "keep" (default) or "remove": '); const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
if ((c || "").toLowerCase() === "remove") canonical_action = "remove"; if ((c || '').toLowerCase() === "remove") canonical_action = "remove";
} }
let threads_count = await ask(rl, "How many download threads? (default 3): "); let threads_count = await ask(rl, "How many download threads? (default 3): ");
threads_count = parseInt(threads_count || "3", 10); threads_count = parseInt(threads_count || "3", 10);
if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3; if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
const exact_url = /^y(es)?$/i.test( const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ") const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
);
const directory = await ask(
rl,
"Target directory (leave blank for default websites/<host>/): "
);
const ext = await ask(rl, "Download external assets? (yes/no, default no): "); const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
const download_external_assets = /^y(es)?$/i.test(ext); const download_external_assets = /^y(es)?$/i.test(ext);

View File

@@ -1,6 +1,6 @@
{ {
"name": "wayback-downloader", "name": "wayback-downloader",
"version": "0.2.0", "version": "0.2.1",
"description": "Interactive Wayback Machine downloader for archiving websites locally.", "description": "Interactive Wayback Machine downloader for archiving websites locally.",
"type": "module", "type": "module",
"main": "downloader.js", "main": "downloader.js",