前言

Leko 因為有些簡單的網頁解析想做,懶得自己 host,所以決定用 Cloudflare Worker。

豈料 CF Worker 裡面居然沒有 DOMParser 可以用 (官方的說法是太慢了),只有他們自己出的 HTMLRewriter 可以用 (原本是設計來改寫網頁部分內容的)。直接拿它來解析 HTML 實在是有點難用。

因此 Leko 把 Cloudflare 的 HTMLRewriter 包裝成可以用來解析 HTML 的版本。

Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import {decode} from 'html-entities';

/**
 * Element handler that reports every element it is given to a callback.
 *
 * With an attribute name configured, the callback receives whatever
 * `getAttribute` returns for that attribute; without one, the callback is
 * invoked with no arguments.
 */
class ElementHandler {
  /**
   * @param {Function} cb - Invoked once for each element passed to `element`.
   * @param {string} [attr] - Attribute to read from the element. When falsy,
   *   `cb` is called without arguments.
   */
  constructor(cb, attr) {
    this._cb = cb;
    this._attr = attr;
  }

  /**
   * Forwards one element to the callback.
   *
   * @param {{getAttribute: Function}} e - The element being handled.
   */
  element(e) {
    // No attribute requested: just signal that the element was seen.
    if (!this._attr) {
      this._cb();
      return;
    }

    const value = e.getAttribute(this._attr);
    this._cb(value);
  }
}

/**
 * Document-level handler that delivers one decoded text node to a callback
 * and signals when the document has ended.
 *
 * Text is ignored until a callback is assigned through the `onText` setter.
 * From then on, chunks are accumulated until one arrives with
 * `lastInTextNode` set; the accumulated text is passed through `decode`,
 * handed to the callback, and capture is switched off again.
 *
 * NOTE(review): assigning `onText` again before the pending text node has
 * completed replaces the earlier callback, which is then never invoked.
 */
class DocumentHandler {
  /**
   * @param {Function} onEnd - Called with no arguments from `end`.
   */
  constructor(onEnd) {
    this._onEnd = onEnd;
    this._onText = () => {};
    this._buffer = "";
    this._doOnText = false;
  }

  /**
   * End-of-document hook; the argument is not used.
   */
  end(end) {
    this._onEnd();
  }

  /**
   * Receives one text chunk.
   *
   * @param {{text: string, lastInTextNode: boolean}} chunk
   */
  text(chunk) {
    if (!this._doOnText) {
      return;
    }

    this._buffer = this._buffer + chunk.text;
    if (!chunk.lastInTextNode) {
      return;
    }

    const decoded = decode(this._buffer);

    // Reset state before invoking the callback so the callback itself may
    // assign `onText` again without being undone afterwards.
    this._doOnText = false;
    this._buffer = "";

    this._onText(decoded);
  }

  /**
   * Arms capture: the next completed text node is delivered to `cb`.
   */
  set onText(cb) {
    this._onText = cb;
    this._doOnText = true;
  }
}

/**
 * Creates an element handler that, each time it fires, assigns `cb` to
 * `dh.onText` so the document handler delivers upcoming text to `cb`.
 *
 * @param {Function} cb - Receives the captured text.
 * @param {object} dh - Document handler exposing an `onText` setter.
 * @returns {ElementHandler} Handler constructed without an attribute name.
 */
function TextHandlerFactory(cb, dh) {
  const armTextCapture = () => {
    dh.onText = cb;
  };

  return new ElementHandler(armTextCapture);
}

/**
 * Extracts values from an HTML response by running it through HTMLRewriter.
 *
 * Each entry of `tracks` maps a result key to a selector spec:
 *   - `"selector"`, `["selector"]` or `["selector", null]` captures text via
 *     the document handler and stores it under the key;
 *   - `["selector", "attr"]` stores the value of `attr` read from the
 *     matched element.
 *
 * Any failure (invalid `tracks`, an error thrown while registering handlers,
 * or an error while reading the transformed body) rejects the returned
 * promise instead of leaving it pending.
 *
 * @param {Response} resp - Response whose body is the HTML to scan.
 * @param {Object<string, string | Array>} tracks - Result key → selector spec.
 * @returns {Promise<Object>} Object holding one property per key whose
 *   handler produced a value.
 */
async function HTMLParser(resp, tracks) {
  const result = {};

  // The end-of-document callback is not needed here: the result is returned
  // once the transformed body has been read to completion.
  const dh = new DocumentHandler(() => {});
  let rewriter = new HTMLRewriter();

  for (const [key, spec] of Object.entries(tracks)) {
    const [selector, attr] = Array.isArray(spec) ? spec : [spec];
    const store = (value) => {
      result[key] = value;
    };

    // A truthy second entry selects attribute capture; otherwise capture text.
    const handler = attr
      ? new ElementHandler(store, attr)
      : TextHandlerFactory(store, dh);

    rewriter = rewriter.on(selector, handler);
  }

  // Drain the transformed body; awaiting it also propagates any stream or
  // handler error to the caller as a rejection.
  await rewriter.onDocument(dh).transform(resp).blob();

  return result;
}

//-------- Usage --------//
/**
 * Usage example: fetches a page and extracts a few values with HTMLParser.
 *
 * @param {string} [url="https://example.com/"] - Page to fetch. The default
 *   is only a placeholder; pass the real target address.
 * @returns {Promise<Object>} Object with the keys `text`, `img`, `text2` and
 *   `text3` holding the values HTMLParser captured for them.
 */
async function crawl(url = "https://example.com/") {
  // Use the parameter here: the bare identifier `URL` is the global URL
  // constructor, not a page address.
  const response = await fetch(url);

  const track = {
    text: "#bodyText", // capture text for #bodyText
    img: ["#avatar img", "src"], // get attribute src of #avatar img
    text2: ["#bodyText2"], // a one-element array also captures text
    text3: ["#bodyText3", null], // a null attribute also captures text
  };

  const data = await HTMLParser(response, track);

  return data; // {text, img, text2, text3} with corresponding values.
}