## HtmlCleaner使用
<div><br></div><div>下面展示一段在工作中总结的关于 HtmlCleaner 的工具类。</div><div><br></div><div>个人觉得 HtmlCleaner 非常好用,是我写爬虫时用得最多的一种方式,一般与 XPath 结合使用;关于 XPath 的内容,请查看 XPath 相关章节。</div><div><br></div><div><hr><br></div><div>
~~~
// 下面展示一段在工作中总结的关于 HtmlCleaner 的工具类。
// 个人觉得 HtmlCleaner 非常好用,是我写爬虫时用得最多的一种方式,一般与 XPath 结合使用;关于 XPath 的内容,请查看 XPath 相关章节。
/**
 * Singleton helper that wraps an {@code HtmlCleaner} instance and offers small
 * convenience methods for evaluating XPath expressions against a cleaned
 * {@code TagNode} tree, plus two regex utilities.
 *
 * <p>All lookup methods are best-effort: failures are logged and a neutral
 * value (empty string, empty array, or {@code null} where noted) is returned
 * instead of propagating the exception.
 */
public class HtmlCleanerUtil {
    /** Timestamp pattern string exposed to callers; not used inside this class. */
    public static String simpleFormat = "yyyy-MM-dd HH:mm:ss";

    private static Logger logger = Logger.getLogger(HtmlCleanerUtil.class);

    /** A non-empty run of ASCII digits; compiled once instead of on every call. */
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    /** Shared cleaner, created eagerly so it no longer depends on field-initialization order. */
    private static final HtmlCleaner cleaner = new HtmlCleaner();

    /** The single instance, created eagerly at class-initialization time. */
    private static final HtmlCleanerUtil instance = new HtmlCleanerUtil();

    private Map<String, String> monthMap = new HashMap<String, String>();

    private HtmlCleanerUtil() {
    }

    /**
     * Returns the shared instance.
     *
     * <p>The instance is created in a static initializer, so no locking or
     * null check is needed here.
     *
     * @return the singleton, never {@code null}
     */
    public static HtmlCleanerUtil getInstance() {
        return instance;
    }

    /**
     * Evaluates an XPath expression and returns the raw result objects.
     *
     * @param tag         node to evaluate against
     * @param evluatePath XPath expression
     * @return the evaluation result, or {@code null} if evaluation failed
     */
    public Object[] parseObjectByTag(TagNode tag, String evluatePath) {
        Object[] res = null;
        try {
            res = tag.evaluateXPath(evluatePath);
        } catch (Exception e) {
            logger.error("parseTagurl", e);
        }
        return res;
    }

    /**
     * Evaluates an XPath expression and keeps only the results that are {@code TagNode}s.
     *
     * @param tag         node to evaluate against
     * @param evluatePath XPath expression
     * @return the matching nodes; an empty array when nothing matched or evaluation failed
     */
    public TagNode[] parseTagNodeArrayByTag(TagNode tag, String evluatePath) {
        // Created outside the try block so the return statement can never see null.
        List<TagNode> result = new ArrayList<TagNode>();
        try {
            Object[] res = tag.evaluateXPath(evluatePath);
            if (res != null) {
                for (Object o : res) {
                    if (o instanceof TagNode) {
                        result.add((TagNode) o);
                    }
                }
            }
        } catch (Exception e) {
            logger.error("parseTagurl", e);
        }
        return result.toArray(new TagNode[result.size()]);
    }

    /**
     * Returns the text of the first node matched by an XPath expression.
     *
     * @param tag         node to evaluate against
     * @param evluatePath XPath expression
     * @return the node text, or {@code ""} when there is no match, the first
     *         result is not a {@code TagNode}, or evaluation failed
     */
    public String parserTextByTag(TagNode tag, String evluatePath) {
        String res = "";
        try {
            Object[] obj = tag.evaluateXPath(evluatePath);
            // Check the type explicitly instead of relying on a caught ClassCastException.
            if (obj != null && obj.length > 0 && obj[0] instanceof TagNode) {
                CharSequence text = ((TagNode) obj[0]).getText();
                if (text != null) {
                    res = text.toString();
                }
            }
        } catch (Exception e) {
            logger.error("parseTagurl", e);
        }
        return res;
    }

    /**
     * Finds the first element whose attribute {@code attrName} equals
     * {@code attrValue} and returns another attribute of that element.
     *
     * @param tag          node to search under
     * @param attrName     attribute name to match on
     * @param attrValue    attribute value to match on
     * @param attrProperty name of the attribute whose value is returned
     * @return the attribute value; {@code ""} when no element matched or the
     *         lookup failed; may be {@code null} when the element matched but
     *         lacks {@code attrProperty}
     */
    public String parserAttrByTag(TagNode tag, String attrName, String attrValue, String attrProperty) {
        String res = "";
        try {
            Object[] obj = tag.getElementsByAttValue(attrName, attrValue, true, true);
            if (obj != null && obj.length > 0 && obj[0] instanceof TagNode) {
                res = ((TagNode) obj[0]).getAttributeByName(attrProperty);
            }
        } catch (Exception e) {
            logger.error("parserAttrByTag", e);
        }
        return res;
    }

    /**
     * Returns an attribute of the first node matched by an XPath expression.
     *
     * @param tag          node to evaluate against
     * @param evluatePath  XPath expression
     * @param attrProperty name of the attribute whose value is returned
     * @return the attribute value; {@code ""} when nothing matched or the
     *         lookup failed; may be {@code null} when the node matched but
     *         lacks {@code attrProperty}
     */
    public String parserAttrByXpath(TagNode tag, String evluatePath, String attrProperty) {
        String res = "";
        try {
            Object[] obj = tag.evaluateXPath(evluatePath);
            if (obj != null && obj.length > 0 && obj[0] instanceof TagNode) {
                res = ((TagNode) obj[0]).getAttributeByName(attrProperty);
            }
        } catch (Exception e) {
            logger.error("parserAttrByTag", e);
        }
        return res;
    }

    /**
     * Parses HTML text into a {@code TagNode} tree using the shared cleaner.
     *
     * @param content HTML source
     * @return the root node produced by {@code HtmlCleaner.clean}
     */
    public TagNode getHtmlTag(String content) {
        return cleaner.clean(content);
    }

    /**
     * Returns capture group 1 of the first match of {@code regex} in {@code content}.
     *
     * @param content text to search; {@code null} yields {@code ""}
     * @param regex   regular expression expected to contain at least one capture group
     * @return group 1 of the first match, or {@code ""} when there is no match,
     *         the regex has no capture group, or group 1 did not participate
     */
    public String patternStr(String content, String regex) {
        if (content == null) {
            return "";
        }
        Matcher m = Pattern.compile(regex).matcher(content);
        // Guard on groupCount: group(1) throws when the regex declares no group.
        if (m.find() && m.groupCount() >= 1) {
            String captured = m.group(1);
            if (captured != null) {
                return captured;
            }
        }
        return "";
    }

    /**
     * Tells whether the string consists solely of ASCII digits.
     *
     * <p>Uses a full match, so a trailing line terminator (for example
     * {@code "123\n"}) is rejected.
     *
     * @param str candidate string, may be {@code null}
     * @return {@code true} only for a non-empty, all-digit string
     */
    public boolean isNumber(String str) {
        return str != null && DIGITS.matcher(str).matches();
    }
}
// 使用案例:为 vimeo.com 定义一个 parser(针对每一个网站分别实现一个 parser 进行解析)
public class VimeoParser extends SearchSite implements ListHandler, PageHandler {
// Logger used by this parser for error reporting.
private static Logger logger = Logger.getLogger(VimeoParser.class);
// Site identifier; written to MediaEntry.site in parsePage and returned by getSitename.
private String site = "vimeo.com";
// HTTP client used by getHtmltoCookie to execute requests.
DefaultHttpClient client = new DefaultHttpClient();
// Shared header group; the static initializer below adds a single Host header to it.
public static HeaderGroup headers = new HeaderGroup();
static {
// NOTE(review): the original trailing note mentions a Cookie header
// ("searchtoken=2e09594c"); presumably it was once added here as well - confirm.
headers.addHeader(new BasicHeader("Host", "vimeo.com")); // Cookie","searchtoken=2e09594c
}
/**
 * Fills {@code me} from the HTML of a single video page.
 *
 * <p>The counters are first reset to -1, then each field extractor is run
 * against the cleaned document in a fixed order.
 *
 * @param content HTML of the video page
 * @param me      entry to populate
 * @return whatever {@code XinUtil.parsePageReturn(me)} reports for the populated entry
 */
@Override
public boolean parsePage(String content, MediaEntry me) {
    // Defaults for values this parser may not be able to extract.
    me.site = site;
    me.views = -1;
    me.comments = -1;
    me.favorites = -1;

    HtmlCleanerUtil cleanerUtil = HtmlCleanerUtil.getInstance();
    TagNode root = cleanerUtil.getHtmlTag(content);

    // The extractor order is kept exactly as in the original implementation.
    getTitle(root, me);
    getDesc(root, me);
    getUrl(root, me);
    getIcon(root, me);
    getViews(root, me);
    getAddDate(root, me);
    getDuration(root, me);
    getTag(root, me);
    getCatagory(root, me);

    return XinUtil.parsePageReturn(me);
}
/**
 * Extracts the entries of a search-result page by delegating to
 * {@code HtmlParserUtil.parseLinkTag} with the XPath of the title links.
 *
 * @param content HTML of the search-result page
 * @return the entries produced by {@code HtmlParserUtil.parseLinkTag}
 */
@Override
public ArrayList<MediaEntry> parseList(String content) {
    final String titleLinkXpath = "//div[@class='thumbnail_format']//div[@class='title']//a";
    HtmlParserUtil linkParser = HtmlParserUtil.getInstance();
    return linkParser.parseLinkTag(site, content, titleLinkXpath);
}
/**
 * Fetches {@code url} with a fixed set of browser-like request headers
 * (including a hard-coded {@code searchtoken} cookie) and returns the body.
 *
 * <p>Best-effort: any failure is logged through the class logger and an
 * empty string is returned.
 *
 * @param url address to fetch; also sent as the Referer header
 * @return the response body, or {@code ""} when the request failed
 */
public String getHtmltoCookie(String url) {
    String ret = "";
    HttpGet httpget = new HttpGet(url);
    httpget.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10");
    httpget.addHeader("Host", "vimeo.com");
    httpget.addHeader("Referer", url);
    httpget.addHeader("Accept-Encoding", "gzip,deflate");
    httpget.addHeader("Keep-Alive", "115");
    httpget.addHeader("Connection", "Keep-Alive");
    httpget.addHeader("Cookie", "searchtoken=2e09594c");
    httpget.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    ResponseHandler<String> responseHandler = new BasicResponseHandler();
    try {
        ret = client.execute(httpget, responseHandler);
    } catch (Exception e) {
        // Report through the logger like the rest of this class, not stderr.
        logger.error("getHtmltoCookie failed for " + url, e);
    }
    return ret;
}
/**
 * Manual smoke test: downloads one Vimeo page, runs {@code parsePage} on it
 * and prints the intermediate and final results to standard output.
 *
 * <p>Other page URLs previously used for manual testing:
 * http://vimeo.com/44649795, http://vimeo.com/channels/nipwitz/114951438,
 * http://vimeo.com/114951438, https://vimeo.com/ondemand/highmaintenance,
 * https://vimeo.com/ondemand/5182, http://vimeo.com/4749536.
 *
 * @param args unused
 */
public static void main(String[] args) {
    System.out.println(isNumber("35.3"));

    final String pageUrl = "https://vimeo.com/106161632";
    final String[][] requestHeaders = {
        { "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0" }
    };
    String content = XinBaseUtil.getUrlString(pageUrl, "UTF-8", requestHeaders);

    MediaEntry me = new MediaEntry();
    System.out.println(content);
    me.pageurl = pageUrl;

    VimeoParser vimeo = new VimeoParser();
    System.out.println(content.length());
    boolean parsed = vimeo.parsePage(content, me);
    System.out.println("result = " + parsed);
    System.out.println(me);
}
/**
 * Builds the search-result URL for a query and page number.
 *
 * @param type         not used by this implementation
 * @param searchString query text, inserted into the URL as given
 * @param page         page number; 0 is treated as page 1
 * @return the search URL
 */
@Override
public String getListUrl(int type, String searchString, int page) {
    int pageNumber = (page == 0) ? 1 : page;
    StringBuilder url = new StringBuilder("http://vimeo.com/search/videos/search:");
    url.append(searchString);
    url.append("/st/2e09594c/page:");
    url.append(pageNumber);
    url.append("/sort:relevant/format:thumbnail");
    return url.toString();
}
/**
 * Copies the {@code og:description} meta content into {@code me.description},
 * substituting an empty string when the lookup yields {@code null}.
 */
private void getDesc(TagNode tag, MediaEntry me) {
    String metaContent = HtmlCleanerUtil.getInstance()
            .parserAttrByXpath(tag, "//meta[@property='og:description']", "content");
    me.description = (metaContent == null) ? "" : metaContent;
}
/**
 * Copies the {@code og:title} meta content into {@code me.title} with every
 * lowercase "vimeo" occurrence removed; a {@code null} lookup result is
 * stored as {@code null}.
 */
private void getTitle(TagNode tag, MediaEntry me) {
    String ogTitle = HtmlCleanerUtil.getInstance()
            .parserAttrByXpath(tag, "//meta[@property='og:title']", "content");
    // replace() leaves the text unchanged when "vimeo" is absent, so no contains() check is needed.
    me.title = (ogTitle == null) ? null : ogTitle.replace("vimeo", "");
}
/**
 * Sets {@code me.category} to the empty string.
 *
 * <p>Category scraping is disabled; the earlier implementation read the text
 * of {@code //div[@class='info_row_cat']} and stripped the "Categories:"
 * label, spaces and line breaks.
 */
private void getCatagory(TagNode tag, MediaEntry me) {
    me.category = "";
}
/**
 * Sets {@code me.tags} to a new empty list.
 *
 * <p>Tag scraping is disabled; the earlier implementation collected the text
 * of every node matching {@code //div[@class='info_row info_row_tag']/a}.
 */
private void getTag(TagNode tag, MediaEntry me) {
    me.tags = new ArrayList<String>();
}
/**
 * Sets {@code me.duration} to -1; the {@code tag} argument is not read.
 */
private void getDuration(TagNode tag, MediaEntry me) {
me.duration = -1;
}
/**
 * Reads the {@code datetime} attribute of the first {@code <time>} element
 * and stores its date part, suffixed with " 00:00:00", in {@code me.adddate}.
 *
 * <p>Commas and spaces are removed, then everything from the first 'T'
 * onwards is dropped. {@code me.adddate} is left untouched when the
 * attribute is missing or no date part remains.
 */
private void getAddDate(TagNode tag, MediaEntry me) {
    String raw = HtmlCleanerUtil.getInstance().parserAttrByXpath(tag, "//time", "datetime");
    if (StringUtils.trimToNull(raw) == null) {
        return;
    }
    String compact = raw.replace(",", "").replace(" ", "");
    // indexOf/substring instead of split("T")[0]: split returns an empty
    // array for input made only of 'T' characters, so [0] would throw.
    int timeSeparator = compact.indexOf('T');
    String datePart = (timeSeparator >= 0) ? compact.substring(0, timeSeparator) : compact;
    if (StringUtils.trimToNull(datePart) != null) {
        me.adddate = datePart + " 00:00:00";
    }
}
/**
 * Reads the play counter from {@code //span[@class='plays']} and stores it
 * in {@code me.views}.
 *
 * <p>Spaces, commas and the words "Plays"/"plays" are stripped. A "K" suffix
 * in either case multiplies the value by 1000. If what remains is still not
 * numeric, one trailing character is dropped and the rest is parsed as an
 * integer. {@code me.views} is left untouched when the counter is missing
 * or cannot be parsed.
 */
private void getViews(TagNode tag, MediaEntry me) {
    String playsText = HtmlCleanerUtil.getInstance().parserTextByTag(tag, "//span[@class='plays']");
    if (StringUtils.trimToNull(playsText) == null) {
        return;
    }
    String count = playsText.trim().replace(" ", "").replace(",", "").replace("Plays", "").replace("plays", "");

    int multiplier = 1;
    if (StringUtils.containsIgnoreCase(count, "K")) {
        multiplier = 1000;
        // Detection is case-insensitive, so strip both spellings.
        count = count.replace("K", "").replace("k", "");
    }

    try {
        if (isNumber(count)) {
            // Apply the detected multiplier instead of an unconditional 1000.
            me.views = (int) (Double.parseDouble(count) * multiplier);
        } else if (count.length() > 1) {
            // Drop one trailing non-numeric character. The cut is based on
            // the counter text itself, not on the length of the XPath string.
            String withoutSuffix = count.substring(0, count.length() - 1);
            long scaled = (long) Integer.parseInt(withoutSuffix) * multiplier;
            me.views = (int) Math.min(Integer.MAX_VALUE, scaled);
        }
    } catch (NumberFormatException e) {
        logger.error(e);
    }
}
/**
 * Copies the {@code og:image} meta content into {@code me.iconurl}; the field
 * is left untouched when the value is null, empty or whitespace only.
 */
private void getIcon(TagNode tag, MediaEntry me) {
    String imageUrl = HtmlCleanerUtil.getInstance()
            .parserAttrByXpath(tag, "//meta[@property='og:image']", "content");
    if (StringUtils.trimToNull(imageUrl) == null) {
        return;
    }
    me.iconurl = imageUrl;
}
/**
 * Resolves the media file URL of the video and stores it in {@code me.url}.
 *
 * <p>Steps:
 * <ol>
 * <li>Derive the clip id from the {@code og:video:url} meta tag(s).</li>
 * <li>Build the player config URL from that id, or fall back to the
 *     {@code data-config-url} attribute of the player div.</li>
 * <li>Download the config JSON and pick a URL from {@code request.files}:
 *     first the h264 entry (hd, then sd, then mobile), then, if present,
 *     the tallest {@code progressive} entry, which takes precedence.</li>
 * </ol>
 * {@code me.url} is left untouched when no config URL can be determined,
 * the download returns nothing, or the JSON has no usable entry.
 */
private void getUrl(TagNode tag, MediaEntry me) {
    HtmlCleanerUtil util = HtmlCleanerUtil.getInstance();
    String script = "//meta[@property='og:video:url']";
    String videoStr = util.parserAttrByXpath(tag, script, "content");
    String videoId = getVideoId(videoStr);
    if (StringUtils.trimToNull(videoId) == null) {
        TagNode[] tagnode = util.parseTagNodeArrayByTag(tag, script);
        for (TagNode t : tagnode) {
            videoStr = t.getAttributeByName("content");
            if (StringUtils.containsIgnoreCase(videoStr, "clip_id")) {
                // Strip parameter separators and the autoplay flag so only the id follows "clip_id=".
                // NOTE(review): the original chain was replace("&", "").replace("", ""); the
                // no-op second call looks like an HTML entity lost in publishing, so "&amp;"
                // is assumed here - confirm against the original source.
                videoStr = videoStr.replace("&amp;", "").replace("&", "").replace("autoplay=1", "");
                break;
            }
        }
        videoId = getVideoId(videoStr);
        if (!util.isNumber(videoId)) {
            videoId = "";
        }
    }

    String request;
    if (StringUtils.trimToNull(videoId) == null) {
        request = util.parserAttrByXpath(tag, "//div[@class='player js-player ']", "data-config-url");
        if (StringUtils.trimToNull(request) == null) {
            // The attribute can be missing (null or ""); there is nothing to fetch then.
            return;
        }
        // NOTE(review): the original replace("&", "&") was a no-op; un-escaping "&amp;" is
        // the presumed intent - confirm.
        request = request.replace("&amp;", "&");
    } else {
        request = "https://player.vimeo.com/video/" + videoId + "/config";
    }

    String returnS = XinBaseUtil.getUrlString(request, "UTF-8", new String[][] {
            { "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0" },
            { "Referer", "https://vimeo.com/" } });
    if (StringUtils.trimToNull(returnS) == null) {
        return;
    }

    try {
        JSONObject json = new JSONObject(returnS);
        if (json.isNull("request")) {
            return;
        }
        JSONObject jsonRequest = json.getJSONObject("request");
        if (jsonRequest.isNull("files")) {
            return;
        }
        JSONObject jsonfile = jsonRequest.getJSONObject("files");
        if (!jsonfile.isNull("h264")) {
            String h264Url = pickH264Url(jsonfile.getJSONObject("h264"));
            if (h264Url != null) {
                me.url = h264Url;
            }
        }
        if (!jsonfile.isNull("progressive")) {
            String progressiveUrl = pickTallestProgressiveUrl(jsonfile.getJSONArray("progressive"));
            // Only override when a URL was found, so an empty progressive list
            // no longer wipes out the h264 URL chosen above.
            if (progressiveUrl.length() > 0) {
                me.url = progressiveUrl;
            }
        }
    } catch (JSONException e) {
        logger.error(e);
    }
}

/** Returns the URL of the best available h264 quality (hd, sd, mobile), or null if none has one. */
private String pickH264Url(JSONObject h264) throws JSONException {
    String[] qualities = { "hd", "sd", "mobile" };
    for (String quality : qualities) {
        if (!h264.isNull(quality)) {
            JSONObject variant = h264.getJSONObject(quality);
            if (!variant.isNull("url")) {
                return variant.getString("url");
            }
        }
    }
    return null;
}

/** Returns the URL of the progressive entry with the greatest height, or "" if no entry qualifies. */
private String pickTallestProgressiveUrl(JSONArray progressive) throws JSONException {
    int maxHeight = 0;
    String bestUrl = "";
    for (int i = 0; i < progressive.length(); i++) {
        JSONObject entry = progressive.getJSONObject(i);
        if (entry.isNull("height") || entry.isNull("url")) {
            continue;
        }
        int height = entry.getInt("height");
        if (height > maxHeight) {
            maxHeight = height;
            bestUrl = entry.getString("url");
        }
    }
    return bestUrl;
}
/**
 * Returns the text following "clip_id=" in {@code videoStr}.
 *
 * @param videoStr URL-like string, may be null or blank
 * @return the part after "clip_id=" when splitting on it yields exactly two
 *         pieces; {@code ""} otherwise
 */
private String getVideoId(String videoStr) {
    if (StringUtils.trimToNull(videoStr) != null) {
        String[] pieces = videoStr.split("clip_id=");
        if (pieces.length == 2) {
            return pieces[1];
        }
    }
    return "";
}
/**
 * Returns the value of the {@code site} field.
 */
@Override
public String getSitename() {
return site;
}
/** Unsigned decimal: digits with at most one '.', and at least one digit overall. */
private static final Pattern UNSIGNED_DECIMAL = Pattern.compile("[0-9]+(?:\\.[0-9]*)?|\\.[0-9]+");

/**
 * Tells whether {@code str} is an unsigned decimal number such as
 * {@code "42"}, {@code "35.3"}, {@code "35."} or {@code ".5"}.
 *
 * <p>The whole string must match. Input with several dots, a lone ".",
 * surrounding whitespace or a trailing line terminator is rejected, so a
 * {@code true} result is always safe to pass to {@code Double.parseDouble}.
 *
 * @param str candidate string, may be {@code null}
 * @return {@code true} if {@code str} is an unsigned decimal number
 */
public static boolean isNumber(String str) {
    if (str == null || str.trim().length() == 0) {
        return false;
    }
    return UNSIGNED_DECIMAL.matcher(str).matches();
}
~~~
- Introduction
- 爬虫相关技能介绍
- 爬虫简单介绍
- 爬虫涉及到的知识点
- 爬虫用途
- 爬虫流程介绍
- 需求描述
- Http请求处理
- http基础知识介绍
- http状态码
- httpheader
- java原生态处理http
- URL类
- 获取URL请求状态
- 模拟Http请求
- apache httpclient
- Httpclient1
- httpclient2
- httpclient3
- httpclient4
- httpclient5
- httpclient6
- okhttp
- OKhttp使用教程
- 技术使用
- java执行javascript
- 网页解析
- Xpath介绍
- HtmlCleaner
- HtmlCleaner介绍
- HtmlCleaner使用
- HtmlParser
- HtmlParser介绍
- Jsoup
- 解析和遍历一个HTML文档
- 解析一个HTML字符串
- 解析一个body片断
- 从一个URL加载一个Document
- 从一个文件加载一个文档
- 使用DOM方法来遍历一个文档
- 使用选择器语法来查找元素
- 从元素抽取属性,文本和HTML
- 处理URLs
- 示例程序 获取所有链接
- 设置属性的值
- 设置一个元素的HTML内容
- 消除不受信任的HTML (来防止XSS攻击)
- 正则表达式
- elasticsearch笔记
- 下载安装elasticsearch
- 检查es服务健康