package ink.wgink.util;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for HTML strings: extracting plain text and re-indenting
 * an HTML fragment.
 */
public class HtmlHelper {

    /**
     * Matches a complete script element (opening tag, body and closing tag).
     */
    private static final Pattern P_SCRIPT =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches a complete style element (opening tag, body and closing tag).
     */
    private static final Pattern P_STYLE =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches any single HTML tag.
     */
    private static final Pattern P_HTML = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches runs of whitespace (spaces, tabs, carriage returns, line feeds).
     */
    private static final Pattern P_SPACE = Pattern.compile("\\s+");

    /**
     * Extracts the plain text of an HTML string.
     *
     * <p>Script elements and style elements are removed together with their
     * bodies, then every remaining tag is removed, and finally ALL whitespace
     * is removed (including the spaces between words).
     *
     * @param html the HTML source; may be {@code null}
     * @return the text with markup and whitespace stripped, or {@code null}
     *         when {@code html} is {@code null}
     */
    public static String getText(String html) {
        if (html == null) {
            return null;
        }
        // Order matters: script/style must go first so that their bodies are
        // dropped instead of being left behind as text once the tags are gone.
        String text = P_SCRIPT.matcher(html).replaceAll("");
        text = P_STYLE.matcher(text).replaceAll("");
        text = P_HTML.matcher(text).replaceAll("");
        return P_SPACE.matcher(text).replaceAll("");
    }

    /**
     * Re-indents an HTML fragment.
     *
     * <p>The fragment is parsed with Jsoup and serialized again; each line of
     * that output is then re-indented with {@code indentType}. The number of
     * indent units written for a line is the number of leading whitespace
     * characters that line has in Jsoup's output, plus {@code indentCount}.
     * Every output line, including the last, ends with a line feed.
     *
     * @param code        the HTML fragment to format
     * @param indentType  the string used as one indent unit; a single space
     *                    when {@code null}
     * @param indentCount extra indent units added to every line; negative
     *                    values are treated as 0
     * @return the formatted HTML, or {@code null} when {@code code} is blank
     *         or Jsoup produces blank output for it
     */
    public static String formatHtml(String code, String indentType, int indentCount) {
        if (StringUtils.isBlank(code)) {
            return null;
        }
        // Drop whitespace between adjacent tags before parsing.
        String compact = code.replaceAll(">\\s+<", "><");
        String html = Jsoup.parseBodyFragment(compact).body().html();
        if (StringUtils.isBlank(html)) {
            return null;
        }
        String indentUnit = indentType == null ? " " : indentType;
        int baseIndent = Math.max(indentCount, 0);

        StringBuilder formatted = new StringBuilder(html.length() + 16);
        for (String line : html.split("\n")) {
            String content = line.trim();
            if (!content.isEmpty()) {
                // Leading whitespace is used as the nesting depth. Counting it
                // directly (rather than locating the first '<') keeps text-only
                // lines and lines that start with text correctly indented.
                int units = countLeadingWhitespace(line) + baseIndent;
                for (int i = 0; i < units; i++) {
                    formatted.append(indentUnit);
                }
                formatted.append(content);
            }
            formatted.append('\n');
        }
        return formatted.toString();
    }

    /**
     * Counts the leading characters of {@code line} that {@link String#trim()}
     * would strip (code points up to and including the space character).
     */
    private static int countLeadingWhitespace(String line) {
        int index = 0;
        while (index < line.length() && line.charAt(index) <= ' ') {
            index++;
        }
        return index;
    }

    /**
     * Manual smoke run: formats a small sample fragment with tab indentation
     * and prints the result.
     */
    public static void main(String[] args) {
        String code = "<div><ul><li>one</li><li>two</li></ul></div>";
        String result = formatHtml(code, "\t", 0);
        System.out.println(result);
    }
}