/* 将 html 解析为适用于小程序 rich-text 的 DOM 结构 github地址:https://github.com/jin-yufeng/Parser 文档地址:https://jin-yufeng.github.io/Parser author:JinYufeng */ const CssHandler = require("./CssHandler.js"); const config = require("./config.js"); const blankChar = config.blankChar; var emoji; // 需要使用 emoji 补丁包时将此行改为 const emoji = require("./emoji.js"); class MpHtmlParser { constructor(data, options = {}) { this.CssHandler = new CssHandler(options.tagStyle); this.data = data; this.DOM = []; // #ifdef MP-BAIDU || MP-TOUTIAO this._imgMode = options.imgMode; // #endif this._attrName = ''; this._attrValue = ''; this._attrs = {}; this._domain = options.domain; this._protocol = options.domain && options.domain.includes("://") ? this._domain.split("://")[0] : "http"; this._i = 0; this._sectionStart = 0; this._state = this.Text; this._STACK = []; this._tagName = ''; this._audioNum = 0; this._imgNum = 0; this._videoNum = 0; this._useAnchor = options.useAnchor; this._whiteSpace = false; }; parse() { if (this.CssHandler) this.data = this.CssHandler.getStyle(this.data); if (emoji) this.data = emoji.parseEmoji(this.data); // 高亮处理 if (config.highlight) this.data = this.data.replace(/<[pP][rR][eE]([\s\S]*?)>([\s\S]*?)<\/[pP][rR][eE][\s\S]*?>/g, function($, $1, $2) { return "' + config.highlight($2, "') + ""; }) for (var len = this.data.length; this._i < len; this._i++) this._state(this.data[this._i]); if (this._state == this.Text) this.setText(); while (this._STACK.length) this.popNode(this._STACK.pop()); // #ifdef MP-BAIDU || MP-TOUTIAO const inlineTags = config.makeMap("abbr,b,big,code,del,em,font,i,ins,label,mark,q,s,small,span,strong,sub,sup,u") // 将顶层标签的一些样式提取出来给 rich-text function setContain(nodes) { for (var i = nodes.length, element; element = nodes[--i];) { if (element.type == "text") continue; if (!element.c) { var tmp, res = "", style = element.attrs.style; if (style) { style = style.toLowerCase(); if (style.indexOf("float") != -1) res += style.match(/float[^;]+(?![\s\S]*?float)/)[0]; if (style.indexOf("margin") != -1) element.attrs.style = style.replace(/margin[^;]+/g, function() { res += (';' + arguments[0]); if (arguments[0].includes("margin-top")) return "margin-top:0"; if (arguments[0].includes("margin-bottom")) return "margin-bottom:0"; if (!arguments[0].includes("margin-left") && !arguments[0].includes("margin-right")) return "margin:0"; return ''; }); if (style.indexOf("display") != -1 && (tmp = style.match(/display\s*:\s*([^;]*)(?![\s\S]*?display)/), tmp[1].indexOf("flex") == -1)) res += (';' + tmp[0]); else if (inlineTags[element.name]) res += ";display:inline"; else res += (";display:" + (element.name == 'img' ? 'inline-block' : 'block')); tmp = style.match(/flex[^;]*:[^;]+/g); if (tmp) res += (';' + tmp.join(';')); if (style.indexOf("width") != -1) element.attrs.style = style.replace(/width[^;]*?%/g, function() { res += (';' + arguments[0]); return "width:100%" }) } else if (inlineTags[element.name]) res = "display:inline"; else res = ("display:" + (element.name == 'img' ? 'inline-block' : 'block')); element.attrs.containStyle = res; } else setContain(element.children); } }; setContain(this.DOM); // #endif if (this.DOM.length) this.DOM[0].PoweredBy = "Parser"; return this.DOM; }; // 设置属性 setAttr() { if (config.trustAttrs[this._attrName]) { if (this._attrName == "src" && this._attrValue[0] == '/') { if (this._attrValue[1] == '/') this._attrValue = this._protocol + ':' + this._attrValue; else if (this._domain) this._attrValue = this._domain + this._attrValue; } this._attrs[this._attrName] = (this._attrValue ? this._attrValue : (this._attrName == "src" ? "" : "true")); } this._attrValue = ''; while (blankChar[this.data[this._i]]) this._i++; if (this.checkClose()) this.setNode(); else this._state = this.AttrName; }; // 设置文本节点 setText() { var text = this.getSelection(); if (!text) return; if (!this._whiteSpace) { // 移除空白符 for (var tmp = [], i = text.length, has = false, c; c = text[--i];) if ((!blankChar[c] && (has = true)) || (!blankChar[tmp[0]] && (c = ' '))) tmp.unshift(c); if (!has) return; text = tmp.join(''); } // 处理实体 // #ifdef MP-BAIDU || MP-ALIPAY || MP-TOUTIAO var entities = { lt: "<", gt: ">", amp: "&", quot: '"', apos: "'", nbsp: "\u00A0", ensp: "\u2002", emsp: "\u2003", ndash: "–", mdash: "—", middot: "·", lsquo: "‘", rsquo: "’", ldquo: "“", rdquo: "”", bull: "•", hellip: "…", permil: "‰", copy: "©", reg: "®", trade: "™", times: "×", divide: "÷", cent: "¢", pound: "£", yen: "¥", euro: "€", sect: "§" }; // #endif var i = text.indexOf('&'), j, u, decode; while (i != -1) { j = text.indexOf(';', i + 2); if (j == -1) break; if (text[i + 1] == '#') { u = parseInt((text[i + 2] == 'x' ? '0' : '') + text.substring(i + 2, j)); if (!isNaN(u)) text = text.substring(0, i) + String.fromCharCode(u) + text.substring(j + 1); } else { u = text.substring(i + 1, j); // #ifdef MP-WEIXIN || MP-QQ || APP-PLUS if (u == "nbsp") text = text.substring(0, i) + '\u00A0' + text.substring(j + 1); // 解决连续   失效的问题 else if (u != "lt" && u != "gt" && u != "amp" && u != "ensp" && u != "emsp") decode = true; // #endif // #ifdef MP-BAIDU || MP-ALIPAY || MP-TOUTIAO if (entities[u]) text = text.substring(0, i) + entities[u] + text.substring(j + 1); // #endif } i = text.indexOf('&', i + 2); } var slibings = this._STACK.length ? this._STACK[this._STACK.length - 1].children : this.DOM; if (slibings.length && slibings[slibings.length - 1].type == "text") { slibings[slibings.length - 1].text += text; if (decode) slibings[slibings.length - 1].decode = true; } else slibings.push({ type: "text", text, decode }) }; // 设置元素节点 setNode() { var slibings = this._STACK.length ? this._STACK[this._STACK.length - 1].children : this.DOM; var node = { name: this._tagName.toLowerCase(), attrs: this._attrs } config.LabelAttrsHandler(node, this); this._attrs = {}; if (this.data[this._i] == '>') { if (!config.selfClosingTags[this._tagName]) { if (config.ignoreTags[node.name]) { var j = this._i; // 处理要被移除的标签 while (this._i < this.data.length) { (this._i = this.data.indexOf("' && this.data[this._i] != '/') this._i++; if (this.data.substring(this._sectionStart, this._i).toLowerCase() == node.name) { this._i = this.data.indexOf('>', this._i); if (this._i == -1) this._i = this.data.length; else this._sectionStart = this._i + 1; this._state = this.Text; // 处理svg if (node.name == "svg") { var src = this.data.substring(j, this._i + 1); if (!node.attrs.xmlns) src = " xmlns=\"http://www.w3.org/2000/svg\"" + src; this._i = j; while (this.data[j] != '<') j--; src = this.data.substring(j, this._i) + src; this._i = this._sectionStart - 1; node.name = "img"; node.attrs = { src: "data:image/svg+xml;utf8," + src.replace(/#/g, "%23"), ignore: "true" } slibings.push(node); } break; } } return; } else this._STACK.push(node); node.children = []; } } else this._i++; this._sectionStart = this._i + 1; this._state = this.Text; if (!config.ignoreTags[node.name]) { // 检查空白符是否有效 if (node.name == "pre" || (node.attrs.style && node.attrs.style.toLowerCase().includes("white-space") && node.attrs .style.toLowerCase().includes("pre"))) { this._whiteSpace = true; node.pre = true; } slibings.push(node); } }; // 标签出栈处理 popNode(node) { // 替换一些标签名 if (config.blockTags[node.name]) node.name = 'div'; else if (!config.trustTags[node.name]) node.name = 'span'; // 空白符处理 if (node.pre) { this._whiteSpace = false; node.pre = undefined; for (var i = this._STACK.length; i--;) if (this._STACK[i].pre) this._whiteSpace = true; } // 处理列表 if (node.c) { if (node.name == "ul") { var floor = 1; for (var i = this._STACK.length; i--;) if (this._STACK[i].name == "ul") floor++; if (floor != 1) for (i = node.children.length; i--;) node.children[i].floor = floor; } else if (node.name == "ol") { function convert(num, type) { if (type == 'a') return String.fromCharCode(97 + (num - 1) % 26); if (type == 'A') return String.fromCharCode(65 + (num - 1) % 26); if (type == 'i' || type == 'I') { num = (num - 1) % 99 + 1; var one = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX'], ten = ['X', 'XX', 'XXX', 'XL', 'L', 'LX', 'LXX', 'LXXX', 'XC'], res = (ten[Math.floor(num / 10) - 1] || '') + (one[num % 10 - 1] || ''); if (type == 'i') return res.toLowerCase(); return res; } return num; } for (var i = 0, num = 1, child; child = node.children[i++];) if (child.name == "li") { child.type = "ol"; child.num = convert(num++, node.attrs.type) + '.'; } } } // 处理表格的边框 if (node.name == 'table') { if (node.attrs.border) node.attrs.style = "border:" + node.attrs.border + "px solid gray;" + (node.attrs.style || ''); if (node.attrs.hasOwnProperty("cellspacing")) node.attrs.style = "border-spacing:" + node.attrs.cellspacing + "px;" + (node.attrs.style || ''); function setBorder(elem) { if (elem.name == 'th' || elem.name == 'td') { if (node.attrs.border) elem.attrs.style = "border:" + node.attrs.border + "px solid gray;" + (elem.attrs.style || ''); if (node.attrs.hasOwnProperty("cellpadding")) elem.attrs.style = "padding:" + node.attrs.cellpadding + "px;" + (elem.attrs.style || ''); return; } if (elem.type == 'text') return; for (var i = 0; i < (elem.children || []).length; i++) setBorder(elem.children[i]); } if (node.attrs.border || node.attrs.hasOwnProperty("cellpadding")) for (var i = 0; i < node.children.length; i++) setBorder(node.children[i]); } // 合并一些不必要的层,减小节点深度 if (node.children.length == 1 && node.name == "div" && node.children[0].name == "div") { var child = node.children[0].attrs; node.attrs.style = node.attrs.style || ''; child.style = child.style || ''; if (node.attrs.style.includes("padding") && (node.attrs.style.includes("margin") || child.style.includes( "margin")) && node.attrs.style.includes("display") && child.style.includes("display") && !(node.attrs.id && node.attrs.id) && !(node.attrs.class && child.class)) { if (child.style.includes("padding")) child.style = "box-sizing:border-box;" + child.style; node.attrs.style = node.attrs.style + ";" + child.style; node.attrs.id = (child.id || '') + (node.attrs.id || ''); node.attrs.class = (child.class || '') + (node.attrs.class || ''); node.children = child.children; } else if (!node.attrs.style) node.attrs.style = undefined; else if (!child.style) child.style = undefined; child = undefined; } // 多层样式处理 if (this.CssHandler.pop) this.CssHandler.pop(node); }; // 工具函数 checkClose() { if (this.data[this._i] == '>' || (this.data[this._i] == '/' && this.data[this._i + 1] == '>')) return true; return false; }; getSelection(trim) { var str = (this._sectionStart == this._i ? '' : this.data.substring(this._sectionStart, this._i)); while (trim && (blankChar[this.data[++this._i]] || (this._i--, false))); this._sectionStart = this._i + 1; return str; }; // 状态机 Text(c) { if (c == '<') { var next = this.data[this._i + 1]; if ((next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z')) { this.setText(); this._state = this.TagName; } else if (next == '/') { this.setText(); this._i++; next = this.data[this._i + 1]; if ((next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z')) { this._sectionStart = this._i + 1; this._state = this.EndTag; } else this._state = this.Comment; } else if (next == '!') { this.setText(); this._state = this.Comment; } } }; Comment() { if (this.data.substring(this._i + 1, this._i + 3) == "--" || this.data.substring(this._i + 1, this._i + 7) == "[CDATA[") { this._i = this.data.indexOf("-->", this._i + 1); if (this._i == -1) return this._i = this.data.length; else this._i = this._i + 2; } else (this._i = this.data.indexOf(">", this._i + 1)) == -1 ? this._i = this.data.length : null; this._sectionStart = this._i + 1; this._state = this.Text; }; TagName(c) { if (blankChar[c]) { this._tagName = this.getSelection(true); if (this.checkClose()) this.setNode(); else this._state = this.AttrName; } else if (this.checkClose()) { this._tagName = this.getSelection(); this.setNode(); } }; AttrName(c) { if (blankChar[c]) { this._attrName = this.getSelection(true).toLowerCase(); if (this.data[this._i] == '=') { while (blankChar[this.data[++this._i]]); this._sectionStart = this._i--; this._state = this.AttrValue; } else this.setAttr(); } else if (c == '=') { this._attrName = this.getSelection().toLowerCase(); while (blankChar[this.data[++this._i]]); this._sectionStart = this._i--; this._state = this.AttrValue; } else if (this.checkClose()) { this._attrName = this.getSelection().toLowerCase(); this.setAttr(); } }; AttrValue(c) { if (c == '"' || c == "'") { this._sectionStart++; if ((this._i = this.data.indexOf(c, this._i + 1)) == -1) return this._i = this.data.length; } else for (; !blankChar[this.data[this._i]] && this.data[this._i] != '>'; this._i++); this._attrValue = this.getSelection(); while (this._attrValue.includes(""")) this._attrValue = this._attrValue.replace(""", ''); this.setAttr(); }; EndTag(c) { if (blankChar[c] || c == '>' || c == '/') { var name = this.getSelection().toLowerCase(); var flag = false; for (var i = this._STACK.length; i--;) if (this._STACK[i].name == name) { flag = true; break; } if (flag) { var node; while (flag) { node = this._STACK.pop(); if (node.name == name) flag = false; this.popNode(node); } } else if (name == 'p' || name == "br") { var slibings = this._STACK.length ? this._STACK[this._STACK.length - 1].children : this.DOM; slibings.push({ name, attrs: {} }); } this._i = this.data.indexOf('>', this._i); if (this._i == -1) this._i = this.data.length; else this._state = this.Text; } }; }; module.exports = (data, options) => new MpHtmlParser(data, options).parse();