前言
Leko 因為有些簡單的網頁解析想做,懶得自己 host,所以決定用 Cloudflare Worker。
豈料 CF Worker 裡面居然沒有 DOMParser 可以用 (官方說法: 太慢了),只有他們自己出的 HTMLRewriter 可以用 (原本是設計來改寫網頁內容的),拿來解析 HTML 實在是有點難用。
因此 Leko 把它的 HTMLRewriter 打包成可以解析 HTML 的版本。
Code
import {decode} from 'html-entities';
/**
 * Element handler that reports a matched element to a callback.
 *
 * When constructed with a truthy attribute name, the callback receives the
 * value of that attribute on the matched element; otherwise the callback is
 * invoked with no arguments.
 */
class ElementHandler {
  /**
   * @param {Function} cb - Invoked once per matched element.
   * @param {string} [attr] - Attribute whose value is forwarded to `cb`.
   */
  constructor(cb, attr) {
    this._cb = cb;
    this._attr = attr;
  }

  /**
   * Called with each element that matches the selector this handler is bound to.
   *
   * @param {object} e - Matched element; only `getAttribute` is used here.
   */
  element(e) {
    // No attribute requested: just signal that the element was seen.
    if (!this._attr) {
      this._cb();
      return;
    }
    this._cb(e.getAttribute(this._attr));
  }
}
/**
 * Document-level handler that captures a single text node on demand.
 *
 * Assigning a callback to `onText` arms the handler. While armed, incoming
 * text chunks are accumulated until a chunk flagged `lastInTextNode` arrives;
 * the accumulated text is then entity-decoded, the handler disarms itself and
 * the callback receives the decoded string.
 */
class DocumentHandler {
  /**
   * @param {Function} onEnd - Invoked (with no arguments) when `end` is called.
   */
  constructor(onEnd) {
    this._onEnd = onEnd;
    this._onText = () => { };
    this._buffer = "";
    this._doOnText = false;
  }

  /**
   * Forwards the end-of-document notification to the `onEnd` callback.
   * The argument is not used.
   */
  end(end) {
    this._onEnd();
  }

  /**
   * Receives one text chunk.
   *
   * @param {object} text - Chunk exposing `text` (string) and `lastInTextNode` (boolean).
   */
  text(text) {
    // Chunks are ignored unless a capture has been armed through `onText`.
    if (!this._doOnText) {
      return;
    }

    this._buffer += text.text;
    if (!text.lastInTextNode) {
      return;
    }

    const decoded = decode(this._buffer);
    this._buffer = "";
    // Disarm before notifying, so the callback is free to arm a new capture.
    this._doOnText = false;
    this._onText(decoded);
  }

  /**
   * Arms the handler: the next complete text node is delivered to `cb`.
   */
  set onText(cb) {
    this._doOnText = true;
    this._onText = cb;
  }
}
/**
 * Builds an element handler that, each time its element is matched, assigns
 * `cb` to `dh.onText`.
 *
 * @param {Function} cb - Callback to install on `dh.onText`.
 * @param {object} dh - Object whose `onText` property is assigned on each match.
 * @returns {ElementHandler} Handler constructed without an attribute name.
 */
function TextHandlerFactory(cb, dh) {
  const armTextCapture = () => {
    dh.onText = cb;
  };
  return new ElementHandler(armTextCapture);
}
/**
 * Streams `resp` through an HTMLRewriter and collects the values described
 * by `tracks`.
 *
 * Each entry of `tracks` maps an output key to either a selector string or a
 * `[selector, attribute]` array. With a truthy attribute, the matched
 * element's attribute value is stored under the key; otherwise a text
 * capture is armed on the document handler when the element is matched and
 * the captured text is stored under the key.
 *
 * @param {Response} resp - Response whose body is run through the rewriter.
 * @param {Object<string, string|Array>} tracks - Output key -> selector spec.
 * @returns {Promise<Object>} Resolves with the collected values; rejects if
 *   setting up the rewriter throws or consuming the transformed body fails.
 */
function HTMLParser(resp, tracks) {
  // The executor is intentionally NOT async: an async executor turns any
  // throw into a rejection of its own discarded promise, which leaves the
  // returned promise pending forever. A synchronous executor converts throws
  // into a rejection of the returned promise.
  return new Promise((resolve, reject) => {
    const result = {};
    const dh = new DocumentHandler(() => resolve(result));
    let rewriter = new HTMLRewriter();

    for (const [key, spec] of Object.entries(tracks)) {
      const [selector, attr] = Array.isArray(spec) ? spec : [spec];
      const store = (value) => {
        result[key] = value;
      };
      const handler = attr
        ? new ElementHandler(store, attr)
        : TextHandlerFactory(store, dh);
      rewriter = rewriter.on(selector, handler);
    }

    // The handlers only run while the transformed body is being consumed.
    // Resolving again after the body has been read is a harmless no-op when
    // the document `end` handler already resolved, and guarantees settlement
    // if it was never invoked.
    rewriter
      .onDocument(dh)
      .transform(resp)
      .blob()
      .then(() => resolve(result), reject);
  });
}
//-------- Usage --------//
/**
 * Usage example: fetches a page and extracts a few values from it.
 *
 * @returns {Promise<Object>} Object with keys `text`, `img`, `text2` and
 *   `text3` holding the corresponding extracted values.
 */
async function crawl() {
  // Output key -> selector, or [selector, attribute name].
  const track = {
    text: "#bodyText", // inner text of #bodyText
    img: ["#avatar img", "src"], // `src` attribute of #avatar img
    text2: ["#bodyText2"], // a one-element array also yields the inner text
    text3: ["#bodyText3", null], // so does a null attribute
  };

  const response = await fetch(URL);
  return HTMLParser(response, track);
}