Cloudflare Worker 的原生 HTML Parser

前言

Leko 因為有些簡單的網頁解析想做,懶得自己 host,所以決定用 Cloudflare Worker。

豈料 CF Worker 裡面居然沒有 DOMParser 可以用 (官方說法: 太慢了),只有他們自己出的 HTMLRewriter 可以用 (原本是用來改寫網頁的部分內容)。直接拿它來解析 HTML 實在是有點難用。

因此 Leko 把它的 HTMLRewriter 打包成可以解析 HTML 的版本。

Code

import {decode} from 'html-entities';

/**
 * Element handler that forwards a matched element to a callback.
 *
 * When an attribute name was supplied, the callback receives the value
 * returned by `getAttribute` for that name; otherwise the callback is
 * invoked with no arguments.
 */
class ElementHandler {
    /**
     * @param {Function} cb - Invoked each time `element` is called.
     * @param {string} [attr] - Attribute name to read from the element.
     *   Any falsy value (omitted, `null`, `""`) means "no attribute".
     */
    constructor(cb, attr) {
        this._cb = cb;
        this._attr = attr;
    }

    /**
     * Notify the callback about an element.
     *
     * @param {{getAttribute: Function}} e - The element; only
     *   `getAttribute` is used, and only when an attribute name is set.
     */
    element(e) {
        const attrName = this._attr;

        // No attribute requested: just signal that the element was seen.
        if (!attrName) {
            this._cb();
            return;
        }

        this._cb(e.getAttribute(attrName));
    }
}

/**
 * Document-level handler that captures one text node on demand and
 * reports when the document has ended.
 *
 * Text capture is "armed" by assigning the `onText` setter. While armed,
 * incoming text chunks are accumulated; once a chunk flagged
 * `lastInTextNode` arrives, the accumulated text is passed through
 * `decode` and handed to the callback, and capture is disarmed again.
 */
class DocumentHandler {
    /**
     * @param {Function} onEnd - Called (with no arguments) from `end`.
     */
    constructor(onEnd) {
        this._onEnd = onEnd;
        this._onText = () => { };
        this._buffer = "";
        this._doOnText = false;
    }

    /**
     * Signal the end of the document. The argument is not used.
     */
    end(end) {
        this._onEnd();
    }

    /**
     * Receive one text chunk.
     *
     * @param {{text: string, lastInTextNode: boolean}} text - The chunk;
     *   ignored entirely unless capture is currently armed.
     */
    text(text) {
        if (!this._doOnText) {
            return;
        }

        this._buffer += text.text;
        if (!text.lastInTextNode) {
            // More chunks of the same text node are still to come.
            return;
        }

        const decoded = decode(this._buffer);

        // Reset state before invoking the callback so that the callback
        // may re-arm capture by assigning `onText` again.
        this._buffer = "";
        this._doOnText = false;

        this._onText(decoded);
    }

    /**
     * Arm text capture: the next completed text node is delivered to `cb`.
     */
    set onText(cb) {
        this._doOnText = true;
        this._onText = cb;
    }
}

/**
 * Build an element handler that, when its element is seen, arms text
 * capture on the given document handler.
 *
 * @param {Function} cb - Callback assigned to `dh.onText`.
 * @param {object} dh - Document handler whose `onText` is assigned.
 * @returns {ElementHandler} Handler constructed without an attribute name.
 */
function TextHandlerFactory(cb, dh) {
    const armTextCapture = () => {
        dh.onText = cb;
    };

    return new ElementHandler(armTextCapture);
}

/**
 * Extract values from an HTML response using HTMLRewriter.
 *
 * @param {Response} resp - The response whose body is streamed through
 *   `HTMLRewriter.transform`.
 * @param {Object<string, string | Array>} tracks - Map from result key to
 *   a selector spec. A spec is either a selector string, or an array
 *   `[selector, attr]`. When `attr` is truthy, the value is obtained via
 *   an `ElementHandler` reading that attribute; otherwise the value is
 *   obtained via `TextHandlerFactory` (text capture).
 * @returns {Promise<Object>} Resolves with an object holding one entry
 *   per track that produced a value. Rejects if setting up the rewriter
 *   throws or if consuming the transformed body fails.
 */
function HTMLParser(resp, tracks) {
    // The executor is deliberately synchronous: an `async` executor would
    // swallow every error (thrown or rejected) and leave the returned
    // promise pending forever.
    return new Promise((resolve, reject) => {
        const result = {};
        const dh = new DocumentHandler(() => resolve(result));
        let rewriter = new HTMLRewriter();

        for (const [key, spec] of Object.entries(tracks)) {
            const [selector, attr] = Array.isArray(spec) ? spec : [spec];
            const store = (value) => {
                result[key] = value;
            };
            const handler = attr
                ? new ElementHandler(store, attr)
                : TextHandlerFactory(store, dh);
            rewriter = rewriter.on(selector, handler);
        }

        // The body must be fully consumed for the handlers to run.
        // Completion also resolves (a no-op if the document `end` handler
        // already did), and any failure is forwarded to the caller.
        rewriter
            .onDocument(dh)
            .transform(resp)
            .blob()
            .then(() => resolve(result), reject);
    });
}

//-------- Usage --------//
/**
 * Usage example: fetch a page and pull a few values out of it.
 *
 * @returns {Promise<Object>} Whatever `HTMLParser` resolves with for the
 *   selector map below, i.e. {text, img, text2, text3} with the
 *   corresponding values.
 */
async function crawl() {
    // Every accepted spec shape is shown once:
    const selectors = {
        text: "#bodyText",            // plain selector -> inner text
        img: ["#avatar img", "src"],  // [selector, attr] -> attribute value
        text2: ["#bodyText2"],        // one-element array -> inner text
        text3: ["#bodyText3", null]   // null attribute -> inner text
    };

    // NOTE(review): `URL` is presumably a placeholder for the address of
    // the page to crawl — confirm it is defined by the surrounding code.
    const page = await fetch(URL);

    const extracted = await HTMLParser(page, selectors);
    return extracted;
}
按讚

發佈留言

電子郵件地址不會被公開。必填項已用 * 標註