mirror of
https://github.com/birbwatcher/wayback-machine-downloader.git
synced 2026-02-10 14:52:23 +00:00
init: initial project setup
This commit is contained in:
114
README.md
114
README.md
@@ -1 +1,113 @@
|
|||||||
# wayback-machine-downloader
|
# Wayback Machine Downloader JS
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
A script written in **Node.js** for downloading websites from [Web Archive](https://web.archive.org/).
|
||||||
|
|
||||||
|
Intended for use by:
|
||||||
|
- **Webmasters** — to restore their lost or hacked projects
|
||||||
|
- **OSINT researchers** — for local work with resources that no longer exist
|
||||||
|
|
||||||
|
This webarchive website downloader has an interactive interface, supports downloading with either original links preserved or rewritten into relative ones (for local usage).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Features of Web Archive Website Downloader
|
||||||
|
|
||||||
|
1. Download entire websites or individual pages from the archive, including HTML, images, scripts, styles, and other assets.
|
||||||
|
2. Rewrite internal links for correct local browsing.
|
||||||
|
3. Multithreading support.
|
||||||
|
4. Save results into a chosen folder while keeping the original structure.
|
||||||
|
5. Ability to download external assets (e.g., images or scripts from a CDN).
|
||||||
|
|
||||||
|
#### Special Features
|
||||||
|
|
||||||
|
- The script fixes parameterized file names such as `main.css?ver=1.2` into `main.css` for proper local work.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Node.js version 18.x or higher
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/birbwatcher/wayback-machine-downloader.git
|
||||||
|
cd wayback-machine-downloader
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
npm install
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
node downloader.js
|
||||||
|
```
|
||||||
|
|
||||||
|
After launching, an interactive menu will appear with the following questions:
|
||||||
|
|
||||||
|
- base URL (e.g., https://example.com)
|
||||||
|
- date range (from/to)
|
||||||
|
- number of threads
|
||||||
|
- link rewriting mode (keep as-is or convert to relative)
|
||||||
|
- whether to remove `rel=canonical` from the downloaded site
|
||||||
|
- whether to download external assets
|
||||||
|
- directory for saving the files
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
node downloader.js
|
||||||
|
```
|
||||||
|
|
||||||
|
Dialog example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
Enter base URL to archive (e.g., https://example.com): https://example.com
|
||||||
|
From timestamp (YYYYMMDDhhmmss) or leave blank: 20200101000000
|
||||||
|
To timestamp (YYYYMMDDhhmmss) or leave blank: 20201231235959
|
||||||
|
Rewrite links? (yes=relative / no=as-is, default no): yes
|
||||||
|
Canonical: "keep" (default) or "remove": keep
|
||||||
|
How many download threads? (default 3): 5
|
||||||
|
Only exact URL (no wildcard /*)? (yes/no, default no): no
|
||||||
|
Target directory (leave blank for default websites/<host>/):
|
||||||
|
Download external assets? (yes/no, default no): no
|
||||||
|
```
|
||||||
|
|
||||||
|
After this, the archive download will begin.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Issues
|
||||||
|
|
||||||
|
#### Script downloads only the homepage
|
||||||
|
**Answer:** try specifying the base URL with `/*` at the end.
|
||||||
|
For example: `https://example.com/*`, or try downloading a different time range.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## (Important) Download responsibly
|
||||||
|
|
||||||
|
Please note that downloading third-party websites may violate copyright laws.
|
||||||
|
Use this tool responsibly and make sure not to break the law.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Pull requests are welcome!
|
||||||
|
For major changes, please open an issue first to discuss what you would like to change.
|
||||||
|
|
||||||
|
1. Fork the project
|
||||||
|
2. Create your feature branch (`git checkout -b feature/fooBar`)
|
||||||
|
3. Commit your changes (`git commit -am 'Add some fooBar'`)
|
||||||
|
4. Push to the branch (`git push origin feature/fooBar`)
|
||||||
|
5. Create a new Pull Request
|
||||||
|
|||||||
BIN
assets/webarchive-downloader.jpg
Normal file
BIN
assets/webarchive-downloader.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 23 KiB |
508
wayback-machine-downloader/downloader.js
Normal file
508
wayback-machine-downloader/downloader.js
Normal file
@@ -0,0 +1,508 @@
|
|||||||
|
/*
|
||||||
|
* Wayback Machine Downloader 0.1 by WhitelightSEO — Interactive (Node.js, ESM)
|
||||||
|
* Run: node downloader.js
|
||||||
|
*/
|
||||||
|
|
||||||
|
import fs from "fs";
|
||||||
|
import path from "path";
|
||||||
|
import { fileURLToPath, pathToFileURL } from "url";
|
||||||
|
import { mkdir } from "fs/promises";
|
||||||
|
import pLimit from "p-limit";
|
||||||
|
import { load } from "cheerio";
|
||||||
|
import { Readable } from "stream";
|
||||||
|
import readline from "readline";
|
||||||
|
|
||||||
|
// ESM has no __filename/__dirname globals; derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
|
||||||
|
|
||||||
|
// ----------------------------- PROGRESS BAR -----------------------------
|
||||||
|
// ----------------------------- PROGRESS BAR -----------------------------
/**
 * Draw a 40-column textual progress bar, overwriting the current terminal
 * line via `\r`. Emits a trailing newline once `current` reaches `total`.
 *
 * @param {number} current - Number of items processed so far.
 * @param {number} total   - Total number of items (0 renders an empty bar).
 */
function renderProgress(current, total) {
  const BAR_WIDTH = 40;
  const ratio = total > 0 ? current / total : 0;
  const doneCols = Math.round(ratio * BAR_WIDTH);
  const cells = "█".repeat(doneCols) + "-".repeat(BAR_WIDTH - doneCols);
  const percent = Math.round(ratio * 100);
  process.stdout.write(`\r[${cells}] ${percent}% (${current}/${total})`);
  if (current === total) process.stdout.write("\n");
}
|
||||||
|
|
||||||
|
// ----------------------------- HELPERS -----------------------------
|
||||||
|
// ----------------------------- HELPERS -----------------------------
/** Convert an OS-specific path to forward-slash (POSIX) form. */
function toPosix(p) {
  return p.replaceAll(path.sep, "/");
}
|
||||||
|
/**
 * Build a POSIX-style relative link from a directory to a file.
 * When the two paths coincide (empty relative path), fall back to the
 * file's base name so the produced href is never empty.
 *
 * @param {string} fromDir - Directory the link originates from.
 * @param {string} toFile  - Target file path.
 * @returns {string} forward-slash relative path.
 */
function relativeLink(fromDir, toFile) {
  const rel = path.relative(fromDir, toFile) || path.basename(toFile);
  return rel.split(path.sep).join("/");
}
|
||||||
|
/**
 * Map a URL pathname to its local file target. Directory-like paths
 * (trailing slash, or a last segment without a dot) get "index.html"
 * appended; file-like paths pass through unchanged.
 *
 * @param {string} pathname - URL pathname (POSIX separators).
 * @returns {string} local target path.
 */
function ensureLocalTargetForPath(pathname) {
  const looksLikeDir =
    pathname.endsWith("/") || !path.posix.basename(pathname).includes(".");
  if (looksLikeDir) {
    return path.posix.join(pathname, "index.html");
  }
  return pathname;
}
|
||||||
|
|
||||||
|
// ----------------------------- HTML CHECK -----------------------------
|
||||||
|
// ----------------------------- HTML CHECK -----------------------------
/**
 * Heuristically decide whether a downloaded file is HTML, checking in order:
 * the Content-Type header, the file extension, then a sniff of the first
 * 512 bytes for a doctype or <html> tag.
 *
 * @param {string} filePath              - Local path (extension is inspected).
 * @param {?string} contentType          - Content-Type header value, if any.
 * @param {?(Buffer|string)} firstBytes  - Leading bytes of the body, if any.
 * @returns {boolean}
 */
function isHtmlFile(filePath, contentType, firstBytes) {
  if (contentType && /text\/html/i.test(String(contentType))) return true;
  const htmlExtensions = [".html", ".htm", ".php", ".asp", ".aspx"];
  if (htmlExtensions.includes(path.extname(filePath).toLowerCase())) {
    return true;
  }
  const head = (firstBytes || "").toString("utf8", 0, 512);
  if (/<!doctype html/i.test(head)) return true;
  return /<html[\s>]/i.test(head);
}
|
||||||
|
|
||||||
|
// ----------------------------- Archive API -----------------------------
|
||||||
|
// ----------------------------- Archive API -----------------------------
/**
 * Query the Wayback Machine CDX API for (timestamp, original-url) pairs.
 *
 * @param {object} opts
 * @param {string} opts.baseUrl       - URL (optionally with /* wildcard) to search.
 * @param {?number} opts.pageIndex    - CDX result page, or null for an unpaged query.
 * @param {boolean} opts.all          - When false, restrict to statuscode:200 captures.
 * @param {number|string} opts.fromTimestamp - Lower bound (YYYYMMDDhhmmss); 0/falsy = none.
 * @param {number|string} opts.toTimestamp   - Upper bound (YYYYMMDDhhmmss); 0/falsy = none.
 * @returns {Promise<Array<[string, string]>>} data rows (header stripped); [] on any error.
 */
async function getRawListFromApi({
  baseUrl,
  pageIndex,
  all,
  fromTimestamp,
  toTimestamp,
}) {
  // BUG FIX: the CDX search endpoint is /cdx/search/cdx; the previous
  // "/cdx/search/xd" path 404s, so every query silently returned [].
  const cdx = new URL("https://web.archive.org/cdx/search/cdx");
  const params = new URLSearchParams();
  params.set("output", "json");
  params.set("url", baseUrl);
  params.set("fl", "timestamp,original");
  // Collapse on digest so byte-identical captures appear only once.
  params.set("collapse", "digest");
  params.set("gzip", "false");
  if (!all) params.append("filter", "statuscode:200");
  if (fromTimestamp && Number(fromTimestamp) !== 0)
    params.set("from", String(fromTimestamp));
  if (toTimestamp && Number(toTimestamp) !== 0)
    params.set("to", String(toTimestamp));
  if (pageIndex != null) params.set("page", String(pageIndex));
  cdx.search = params.toString();

  try {
    const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
    const text = await res.text();
    const json = JSON.parse(text);
    // First row of a CDX JSON response is the header ["timestamp","original"].
    if (
      Array.isArray(json) &&
      Array.isArray(json[0]) &&
      json[0].join(",") === "timestamp,original"
    ) {
      json.shift();
    }
    return json || [];
  } catch (e) {
    // Network/parse failures are treated as "no snapshots" — callers use
    // an empty batch to stop pagination.
    console.log(`ERROR getRawListFromApi: ${e}`);
    return [];
  }
}
|
||||||
|
|
||||||
|
// ----------------------------- DOWNLOADER CLASS -----------------------------
|
||||||
|
// ----------------------------- DOWNLOADER CLASS -----------------------------
/**
 * Downloads the newest capture of every URL a site has in the Wayback
 * Machine into a local directory tree, optionally rewriting links for
 * offline browsing and pulling in referenced assets.
 *
 * Expected params (all optional except base_url):
 *   base_url                 site root, e.g. "https://example.com"
 *   exact_url                true = skip the "/*" wildcard crawl
 *   directory                output dir (default: websites/<host>/)
 *   from_timestamp           lower bound, YYYYMMDDhhmmss (0 = none)
 *   to_timestamp             upper bound, YYYYMMDDhhmmss (0 = none)
 *   threads_count            parallel downloads (default 3)
 *   download_external_assets also fetch assets hosted off-site
 *   rewrite_mode             "as-is" (default) or "relative"
 *   canonical_action         "keep" (default) or "remove"
 */
class WaybackMachineDownloader {
  constructor(params) {
    this.base_url = params.base_url;
    this.exact_url = !!params.exact_url;
    this.directory = params.directory || null;
    // Timestamps are normalized to numbers; 0 means "no bound".
    this.from_timestamp = params.from_timestamp
      ? Number(params.from_timestamp)
      : 0;
    this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
    this.threads_count =
      params.threads_count != null ? Number(params.threads_count) : 3;

    this.download_external_assets = params.download_external_assets || false;

    this.rewrite_mode = params.rewrite_mode || "as-is";
    this.rewrite_links = this.rewrite_mode === "relative";
    this.canonical_action = params.canonical_action || "keep";

    // Count of finished (or skipped) downloads, drives the progress bar.
    this._processed = 0;
  }

  /**
   * Host name used for the default output folder. Falls back to the raw
   * base_url when it is not parseable as an absolute URL.
   */
  backup_name() {
    try {
      if (this.base_url.includes("//")) {
        const u = new URL(this.base_url);
        return u.host;
      }
    } catch {}
    return this.base_url;
  }
  /**
   * Root output directory, always with a trailing path separator:
   * either the user-supplied directory or websites/<host>/.
   */
  backup_path() {
    if (this.directory) {
      return this.directory.endsWith(path.sep)
        ? this.directory
        : this.directory + path.sep;
    }
    return path.join("websites", this.backup_name(), path.sep);
  }

  /**
   * Collect raw (timestamp, url) rows from the CDX API: one unpaged query
   * for the exact base URL, then (unless exact_url) paged wildcard queries
   * "<base>/*" until an empty page or 100 pages. Prints a "." per request.
   */
  async get_all_snapshots_to_consider() {
    console.log("Getting snapshot pages");
    const httpOpts = {
      all: true,
      fromTimestamp: this.from_timestamp,
      toTimestamp: this.to_timestamp,
    };
    let list = [];

    list = list.concat(
      await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
    );
    process.stdout.write(".");

    if (!this.exact_url) {
      // Normalize to exactly one "/*" suffix regardless of trailing slashes.
      const wildcard = this.base_url.endsWith("/*")
        ? this.base_url
        : this.base_url.replace(/\/*$/, "") + "/*";
      for (let i = 0; i < 100; i++) {
        const batch = await getRawListFromApi({
          baseUrl: wildcard,
          pageIndex: i,
          ...httpOpts,
        });
        if (!batch || batch.length === 0) break;
        list = list.concat(batch);
        process.stdout.write(".");
      }
    }
    console.log(` found ${list.length} snapshots to consider.\n`);
    return list;
  }

  /**
   * Reduce raw snapshots to one entry per pathname, keeping the newest
   * capture (string comparison works because timestamps are fixed-width).
   * Returns entries sorted newest-first: { file_url, timestamp, file_id }.
   */
  async get_file_list_by_timestamp() {
    const curated = new Map();
    const all = await this.get_all_snapshots_to_consider();
    for (const pair of all) {
      const ts = pair[0];
      const url = pair[1];
      try {
        const u = new URL(url);
        const file_id = u.pathname;
        const prev = curated.get(file_id);
        if (!prev || prev.timestamp <= ts) {
          curated.set(file_id, { file_url: url, timestamp: ts, file_id });
        }
      } catch {}
    }
    const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id }));
    arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));
    return arr;
  }

  /**
   * On Windows, percent-encode characters that are illegal in file names
   * (":*?&=<>\|"). No-op elsewhere.
   */
  _windowsSanitize(p) {
    if (process.platform !== "win32") return p;
    return p.replace(/[:*?&=<>\\|]/g, (s) =>
      "%" + s.charCodeAt(0).toString(16)
    );
  }
  /** mkdir -p; tolerates the directory already existing. */
  async _structure_dir_path(dir_path) {
    try {
      await mkdir(dir_path, { recursive: true });
    } catch (e) {
      if (!e || e.code !== "EEXIST") throw e;
    }
  }

  /**
   * Map a remote URL + pathname onto local { dir_path, file_path }.
   * Returns null for non-fetchable schemes (data:, javascript:) and for
   * excessively long paths (> 200 chars). Directory-like URLs (trailing
   * slash or extension-less last segment) become <dir>/index.html.
   */
  _determine_paths(file_url, file_id) {
    if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
      return null;
    if (file_id.length > 200) return null;

    const backup = this.backup_path();
    const parts = file_id.split("/").filter(Boolean);
    let dir_path, file_path;

    if (file_id === "") {
      dir_path = backup;
      file_path = path.join(backup, "index.html");
    } else if (
      file_url.endsWith("/") ||
      !parts[parts.length - 1].includes(".")
    ) {
      dir_path = path.join(backup, ...parts);
      file_path = path.join(dir_path, "index.html");
    } else {
      dir_path = path.join(backup, ...parts.slice(0, -1));
      file_path = path.join(backup, ...parts);
    }

    dir_path = this._windowsSanitize(dir_path);
    file_path = this._windowsSanitize(file_path);

    return { dir_path, file_path };
  }

  /**
   * Fetch one asset via the "id_" (unmodified-content) snapshot endpoint
   * and stream it to file_path. Best-effort: every failure is logged and
   * returns null; success returns file_path. Skips files already on disk.
   */
  async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
    try {
      if (fs.existsSync(file_path)) return file_path;

      await this._structure_dir_path(dir_path);
      const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
        return null;
      }
      if (!res.ok || !res.body) {
        console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`);
        return null;
      }

      // Bridge the WHATWG response stream into a Node write stream.
      await new Promise((resolve, reject) => {
        const ws = fs.createWriteStream(file_path);
        Readable.fromWeb(res.body).pipe(ws);
        ws.on("finish", resolve);
        ws.on("error", reject);
      });

      return file_path;
    } catch (e) {
      console.log(`Asset download failed: ${assetUrl} → ${e}`);
      return null;
    }
  }

  /**
   * Post-process a saved HTML file: queue downloads for referenced assets
   * (internal always; external only when enabled), optionally rewrite
   * asset/page/form URLs to local relative paths, optionally strip
   * rel=canonical, then write the (possibly modified) HTML back in place.
   * Errors are logged, never thrown.
   */
  async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
    try {
      const backupRoot = this.backup_path();
      let html = fs.readFileSync(htmlPath, "utf8");
      const $ = load(html);
      const site = new URL(this.base_url);
      // Host comparison ignores a leading "www." on either side.
      const siteHost = site.hostname.replace(/^www\./, "");
      const baseDir = path.dirname(htmlPath);

      const downloadTasks = [];

      // ----------- ASSETS -----------
      $(
        "img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
      ).each((_, el) => {
        const attr = el.tagName === "link" ? "href" : "src";
        const val = $(el).attr(attr);
        if (!val) return;

        try {
          // Resolve relative references against the page's original URL.
          const abs = new URL(val, pageUrl).toString();
          const u = new URL(abs);
          const isInternal = u.hostname.replace(/^www\./, "") === siteHost;

          if (isInternal || this.download_external_assets) {
            const file_id = u.pathname;
            const paths = this._determine_paths(abs, file_id);
            if (!paths) return;
            const { dir_path, file_path } = paths;

            if (this.rewrite_links) {
              const normPath = u.pathname + (u.hash || "");
              const localTarget = ensureLocalTargetForPath(normPath);
              const localAbsPath = path.join(backupRoot, localTarget);
              $(el).attr(attr, relativeLink(baseDir, localAbsPath));
            }

            if (!fs.existsSync(file_path)) {
              downloadTasks.push(
                this._download_asset(abs, pageTimestamp, file_path, dir_path)
              );
            }
          }
        } catch {}
      });

      // ----------- INTERNAL LINKS (pages/forms) -----------
      if (this.rewrite_links) {
        $("a[href], form[action]").each((_, el) => {
          const attr = el.tagName === "a" ? "href" : "action";
          const val = $(el).attr(attr);
          if (!val) return;

          try {
            const abs = new URL(val, pageUrl).toString();
            const u = new URL(abs);
            const isInternal = u.hostname.replace(/^www\./, "") === siteHost;

            if (isInternal) {
              const normPath = u.pathname + (u.hash || "");
              const localTarget = ensureLocalTargetForPath(normPath);
              const localAbsPath = path.join(backupRoot, localTarget);
              $(el).attr(attr, relativeLink(baseDir, localAbsPath));
            }
          } catch {}
        });
      }

      await Promise.all(downloadTasks);

      if (this.canonical_action === "remove") {
        $("link[rel=\"canonical\"]").remove();
      }

      fs.writeFileSync(htmlPath, $.html(), "utf8");
    } catch (e) {
      console.log(`HTML processing error: ${e}`);
    }
  }

  /**
   * Download one curated snapshot entry to disk, then post-process it as
   * HTML when it looks like a page. Progress is advanced exactly once per
   * entry (early returns before the try/finally advance it explicitly).
   */
  async _download_single(file_remote_info, total) {
    const file_url = String(file_remote_info.file_url);
    const file_id = file_remote_info.file_id;
    const file_timestamp = file_remote_info.timestamp;
    const paths = this._determine_paths(file_url, file_id);
    if (!paths) {
      console.log(`Skipping invalid URL: ${file_url}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }
    const { dir_path, file_path } = paths;

    // Resume support: files already on disk are never re-fetched.
    if (fs.existsSync(file_path)) {
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    try {
      await this._structure_dir_path(dir_path);
      const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        console.log(`Skipping ${file_url}, fetch failed: ${e}`);
        return;
      }

      if (!res.ok || !res.body) {
        console.log(`Skipping ${file_url}, bad response ${res.status}`);
        return;
      }

      await new Promise((resolve, reject) => {
        const ws = fs.createWriteStream(file_path);
        Readable.fromWeb(res.body).pipe(ws);
        ws.on("finish", resolve);
        ws.on("error", reject);
      });

      const contentType = res.headers.get("content-type");
      const ext = path.extname(file_path).toLowerCase();
      const looksHtml =
        isHtmlFile(file_path, contentType, null) ||
        ext === "" ||
        ext === ".html" ||
        ext === ".htm";
      if (looksHtml) {
        await this._process_html_assets(file_path, file_url, file_timestamp);
      }
    } catch (e) {
      console.log(`Download failed for ${file_url}: ${e}`);
    } finally {
      // Advance progress exactly once regardless of outcome.
      this._processed++;
      renderProgress(this._processed, total);
    }
  }

  /**
   * Entry point: build the file list, download everything with at most
   * threads_count concurrent transfers (via p-limit), and report timing.
   */
  async download_files() {
    const startTime = Date.now();
    console.log(
      `Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
    );
    const list = await this.get_file_list_by_timestamp();
    if (list.length === 0) {
      console.log("No files to download.");
      return;
    }

    const concurrency =
      this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
    const limit = pLimit(concurrency);
    this._processed = 0;
    await Promise.all(
      list.map((info) => limit(() => this._download_single(info, list.length)))
    );
    const endTime = Date.now();
    console.log(
      `\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
        2
      )}s, saved in ${this.backup_path()} (${list.length} files)`
    );
  }
}
|
||||||
|
|
||||||
|
// ============================= INTERACTIVE RUN =============================
|
||||||
|
// ============================= INTERACTIVE RUN =============================
/**
 * Promisified readline question; resolves with the answer stripped of
 * surrounding whitespace.
 *
 * @param {object} rl       - readline.Interface (or anything with .question).
 * @param {string} question - Prompt text.
 * @returns {Promise<string>}
 */
function ask(rl, question) {
  return new Promise((resolve) => {
    rl.question(question, (answer) => {
      resolve(answer.trim());
    });
  });
}
|
||||||
|
|
||||||
|
/**
 * Prompt the user on stdin for all download settings, construct a
 * WaybackMachineDownloader from the answers, and run the download.
 * Defaults: no link rewriting, canonical kept, 3 threads, wildcard crawl
 * enabled, external assets off, output under websites/<host>/.
 */
async function interactiveMain() {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  // Re-ask until a syntactically valid absolute URL is supplied.
  let base_url;
  while (true) {
    base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): ");
    if (!base_url) continue;
    try {
      new URL(base_url);
      break;
    } catch {
      console.log("Please enter a valid URL.\n");
    }
  }

  const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
  const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");

  // Any "y"/"yes" answer (case-insensitive) enables relative rewriting.
  let rewrite_mode = "as-is";
  const m = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
  if (/^y(es)?$/i.test(m)) rewrite_mode = "relative";

  // Canonical handling is only asked about when links are rewritten.
  let canonical_action = "keep";
  if (rewrite_mode === "relative") {
    const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
    if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
  }

  // Thread count falls back to 3 on blank, non-numeric, or non-positive input.
  let threads_count = await ask(rl, "How many download threads? (default 3): ");
  threads_count = parseInt(threads_count || "3", 10);
  if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;

  const exact_url = /^y(es)?$/i.test(
    await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")
  );
  const directory = await ask(
    rl,
    "Target directory (leave blank for default websites/<host>/): "
  );

  const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
  const download_external_assets = /^y(es)?$/i.test(ext);

  rl.close();

  const dl = new WaybackMachineDownloader({
    base_url,
    exact_url,
    directory: directory || null,
    from_timestamp: from_timestamp || 0,
    to_timestamp: to_timestamp || 0,
    threads_count,
    rewrite_mode,
    canonical_action,
    download_external_assets,
  });

  await dl.download_files();
}
|
||||||
|
|
||||||
|
const isDirectRun =
|
||||||
|
import.meta.url === `file://${process.argv[1]}` ||
|
||||||
|
import.meta.url === pathToFileURL(process.argv[1]).href;
|
||||||
|
|
||||||
|
if (isDirectRun) {
|
||||||
|
interactiveMain().catch((err) => {
|
||||||
|
console.error(`FATAL: ${err?.stack || err}`);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
export { WaybackMachineDownloader };
|
||||||
35
wayback-machine-downloader/package.json
Normal file
35
wayback-machine-downloader/package.json
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
{
|
||||||
|
"name": "wayback-downloader",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
|
||||||
|
"type": "module",
|
||||||
|
"main": "downloader.js",
|
||||||
|
"bin": {
|
||||||
|
"wayback-downloader": "downloader.js"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"start": "node downloader.js"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"cheerio": "^1.0.0-rc.12",
|
||||||
|
"p-limit": "^4.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
},
|
||||||
|
"keywords": [
|
||||||
|
"wayback-machine-downloader",
|
||||||
|
"web-archive-downloder",
|
||||||
|
"archiver"
|
||||||
|
],
|
||||||
|
"author": "birbwatcher",
|
||||||
|
"license": "MIT",
|
||||||
|
"repository": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/birbwatcher/wayback-downloader.git"
|
||||||
|
},
|
||||||
|
"bugs": {
|
||||||
|
"url": "https://github.com/birbwatcher/wayback-downloader/issues"
|
||||||
|
},
|
||||||
|
"homepage": "https://github.com/birbwatcher/wayback-downloader#readme"
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user