60 lines
1.7 KiB
Java
60 lines
1.7 KiB
Java
|
package ink.wgink.util;
|
||
|
|
||
|
import java.util.regex.Matcher;
|
||
|
import java.util.regex.Pattern;
|
||
|
|
||
|
/**
 * Regex-based helper for reducing an HTML fragment to bare text.
 *
 * <p>This is a lightweight tag stripper, not an HTML parser: it does not decode
 * character entities (for example {@code &amp;nbsp;}) and it does not understand
 * comments or CDATA sections.
 */
public class HtmlHelper {

    /**
     * Matches a whole {@code <script>} element, including its content.
     * Whitespace before the closing {@code >} of the end tag is tolerated so that
     * {@code </script >} still terminates the element.
     */
    private static final Pattern P_SCRIPT = Pattern.compile("<script[^>]*?>[\\s\\S]*?</script\\s*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Matches a whole {@code <style>} element, including its content.
     * Whitespace before the closing {@code >} of the end tag is tolerated.
     */
    private static final Pattern P_STYLE = Pattern.compile("<style[^>]*?>[\\s\\S]*?</style\\s*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Matches any remaining tag, i.e. everything from {@code <} up to the next {@code >}.
     */
    private static final Pattern P_HTML = Pattern.compile("<[^>]+>");

    /**
     * Matches a run of whitespace characters (spaces, tabs, carriage returns, line feeds, ...).
     * Uses {@code \s+} so the pattern never matches the empty string.
     */
    private static final Pattern P_SPACE = Pattern.compile("\\s+");

    /**
     * Extracts the plain text of an HTML fragment.
     *
     * <p>Processing order: {@code <script>} elements are removed together with their
     * content, then {@code <style>} elements likewise, then every remaining tag, and
     * finally all whitespace. Because every whitespace character is removed, words that
     * were separated only by whitespace end up joined in the result.
     *
     * @param html the HTML fragment to strip; may be {@code null}
     * @return the text with scripts, styles, tags and whitespace removed; an empty
     *         string when {@code html} is {@code null}
     * @author WenG
     */
    public static String getText(String html) {
        if (html == null) {
            // Nothing to strip; avoid the NullPointerException Pattern.matcher would throw.
            return "";
        }
        // Script and style elements go first so that their content is dropped with them
        // instead of surviving the generic tag removal below.
        String text = P_SCRIPT.matcher(html).replaceAll("");
        text = P_STYLE.matcher(text).replaceAll("");
        text = P_HTML.matcher(text).replaceAll("");
        return P_SPACE.matcher(text).replaceAll("");
    }

}
|