import { decode } from 'html-entities';
/**
 * Element handler that forwards a matched element to a callback.
 *
 * When an attribute name was supplied, the callback receives the value
 * returned by `element.getAttribute(name)`; otherwise the callback is
 * invoked with no arguments, acting as a plain "element seen" signal.
 */
class ElementHandler {
  /**
   * @param {Function} cb - Callback invoked for each matched element.
   * @param {string} [attr] - Attribute whose value is passed to `cb`.
   */
  constructor(cb, attr) {
    this._cb = cb;
    this._attr = attr;
  }

  /**
   * Called with each element that matches the selector this handler is
   * registered for.
   *
   * @param {{ getAttribute: (name: string) => unknown }} e - Matched element.
   */
  element(e) {
    const attributeName = this._attr;
    if (!attributeName) {
      // No attribute requested: just notify that the element was seen.
      this._cb();
      return;
    }
    this._cb(e.getAttribute(attributeName));
  }
}
/**
 * Document-level handler that reports the end of the document and, on
 * request, captures a single text node.
 *
 * Assigning a callback to `onText` arms the handler: subsequent text chunks
 * are buffered until a chunk flagged `lastInTextNode` arrives, at which point
 * the buffered text is entity-decoded with `decode` and passed to the
 * callback. The handler then disarms itself until `onText` is assigned again.
 */
class DocumentHandler {
  /**
   * @param {Function} onEnd - Invoked (with no arguments) when `end` is called.
   */
  constructor(onEnd) {
    this._onEnd = onEnd;
    this._onText = () => {};
    this._buffer = "";
    this._doOnText = false;
  }

  /**
   * Signals the end of the document.
   *
   * @param {unknown} end - End-of-document token; unused here.
   */
  end(end) {
    this._onEnd();
  }

  /**
   * Receives one text chunk. Chunks are ignored unless a capture is armed.
   *
   * @param {{ text: string, lastInTextNode: boolean }} text - Text chunk.
   */
  text(text) {
    if (!this._doOnText) {
      return;
    }
    this._buffer += text.text;
    if (!text.lastInTextNode) {
      return;
    }
    // Reset capture state before decoding/calling back so that a throwing
    // decode or callback cannot leave the handler armed with stale text.
    const raw = this._buffer;
    this._buffer = "";
    this._doOnText = false;
    this._onText(decode(raw));
  }

  /**
   * Arms a one-shot capture of the next complete text node.
   *
   * @param {(text: string) => void} cb - Receives the decoded text.
   */
  set onText(cb) {
    this._doOnText = true;
    this._onText = cb;
  }
}
/**
 * Builds an element handler that, each time its element is matched, assigns
 * `cb` to `dh.onText`.
 *
 * @param {Function} cb - Callback assigned to `dh.onText` on every match.
 * @param {object} dh - Object whose `onText` property is assigned.
 * @returns {ElementHandler} Handler constructed without an attribute name.
 */
function TextHandlerFactory(cb, dh) {
  const armTextCapture = () => {
    dh.onText = cb;
  };
  return new ElementHandler(armTextCapture);
}
/**
 * Extracts values from an HTML response by streaming it through
 * `HTMLRewriter`.
 *
 * Each entry of `tracks` maps a result key to either a selector string or a
 * `[selector, attributeName]` array. When `attributeName` is truthy, the
 * matched element's attribute value is stored under the key; otherwise a text
 * capture is requested through the document handler's `onText` hook and the
 * text it delivers is stored under the key.
 *
 * @param {Response} resp - Response whose body is passed to `transform`.
 * @param {Object<string, string | Array>} tracks - Key → selector spec.
 * @returns {Promise<Object>} Resolves with the collected values once the
 *   transformed body has been fully read; rejects if setup or reading fails.
 */
async function HTMLParser(resp, tracks) {
  const result = {};
  // Completion is signalled by the awaited `blob()` below, so the
  // end-of-document callback has nothing left to do.
  const dh = new DocumentHandler(() => {});
  let rewriter = new HTMLRewriter();

  for (const [key, spec] of Object.entries(tracks)) {
    const [selector, attr] = Array.isArray(spec) ? spec : [spec];
    const store = (value) => {
      result[key] = value;
    };
    const handler = attr
      ? new ElementHandler(store, attr)
      : TextHandlerFactory(store, dh);
    rewriter = rewriter.on(selector, handler);
  }

  // The handlers only run while the transformed body is being consumed, so
  // read it to completion; the blob itself is discarded.
  await rewriter.onDocument(dh).transform(resp).blob();
  return result;
}
/**
 * Fetches a page and extracts a fixed set of fields from it via `HTMLParser`.
 *
 * @returns {Promise<Object>} The object produced by `HTMLParser` for the
 *   `text`, `img`, `text2` and `text3` tracks.
 */
async function crawl() {
  // Result key → selector, or [selector, attribute name].
  const track = {
    text: "#bodyText",
    img: ["#avatar img", "src"],
    text2: ["#bodyText2"],
    text3: ["#bodyText3", null],
  };

  // NOTE(review): `URL` is not defined in the visible source; unless it is
  // declared elsewhere it refers to the global URL constructor — confirm the
  // intended target address.
  const response = await fetch(URL);
  const data = await HTMLParser(response, track);
  return data;
}
|