HtmlCleaner使用 · 网络爬虫知识汇总

##HtmlCleaner使用 <div><br></div><div>下面展示一段在工作当中总结的关于Htmlcleaner的工具类</div><div><br></div><div>htmlcleaner个人觉的非常好用，是我写爬虫的时候经常用到的最多的一种方式，一般跟xpath结合，关于xpath的内容，查看xpath相关内容。</div><div><br></div><div><hr><br></div><div> ~~~ 下面展示一段在工作当中总结的关于Htmlcleaner的工具类 htmlcleaner个人觉的非常好用，是我写爬虫的时候经常用到的最多的一种方式，一般跟xpath结合，关于xpath的内容，查看xpath相关内容。 public class HtmlCleanerUtil { public static String simpleFormat = "yyyy-MM-dd HH:mm:ss"; private static Logger logger = Logger.getLogger(HtmlCleanerUtil.class); private static HtmlCleaner cleaner=null; private static HtmlCleanerUtil instance = new HtmlCleanerUtil(); private Map<String ,String> monthMap = new HashMap<String,String>(); private HtmlCleanerUtil(){ cleaner = new HtmlCleaner(); } public static HtmlCleanerUtil getInstance(){ synchronized (instance) { if(instance==null){ instance=new HtmlCleanerUtil(); } } return instance; } public Object[] parseObjectByTag(TagNode tag,String evluatePath){ Object[] res = null; try { res= tag.evaluateXPath(evluatePath); } catch (Exception e) { //System.out.println("******parser tag error***********"); logger.error("parseTagurl", e);//e.printStackTrace(); } return res; } public TagNode[] parseTagNodeArrayByTag(TagNode tag,String evluatePath){ Object[] res = null; List<TagNode> result = null; try { result = new ArrayList<TagNode>(); res= tag.evaluateXPath(evluatePath); if(res!=null &&res.length>0){ for(Object o:res){ if(o instanceof TagNode){ result.add((TagNode)o); } } } } catch (Exception e) { //System.out.println("******parser tag error***********"); logger.error("parseTagurl", e);//e.printStackTrace(); } TagNode[] resa = new TagNode[result.size()]; return result.toArray(resa); } //get tag text public String parserTextByTag(TagNode tag,String evluatePath){ String res = ""; try { Object[] obj = tag.evaluateXPath(evluatePath); if (obj.length > 0) { StringBuffer sb = ((TagNode) obj[0]).getText(); if(sb!=null) res=sb.toString(); } } catch (Exception e) { //System.out.println("******parser tag error***********"); logger.error("parseTagurl", e);//e.printStackTrace(); } return res; } //get tag property value public String parserAttrByTag(TagNode tag,String attrName,String attrValue,String attrProperty ) { String res = ""; try { Object[] obj = tag.getElementsByAttValue(attrName, attrValue,true, true); if (obj.length > 0) { res = ((TagNode) obj[0]).getAttributeByName(attrProperty); } } catch (Exception e) { logger.error("parserAttrByTag", e);//e.printStackTrace(); } return res; } //get tag property value public String parserAttrByXpath(TagNode tag,String evluatePath,String attrProperty ) { String res = ""; try { Object[] obj = tag.evaluateXPath(evluatePath); if (obj.length > 0) { res = ((TagNode) obj[0]).getAttributeByName(attrProperty); } } catch (Exception e) { logger.error("parserAttrByTag", e);//e.printStackTrace(); } return res; } public TagNode getHtmlTag(String content){ TagNode tag = cleaner.clean(content); return tag; } public String patternStr(String content,String regex){ if(content==null) return ""; String res=""; Pattern p=Pattern.compile(regex); Matcher m=p.matcher(content); content = null; if(m.find()){ if(m.group()!=null) res=m.group(1); } return res; } public boolean isNumber(String str){ if(str==null ||str.trim().length()==0) return false; String regex="^[0-9]*$"; Pattern p=Pattern.compile(regex); Matcher m=p.matcher(str); if(m.find()){ return true; } return false; } } 使用案例定义(vimeo.com)一个parser，针对每一个网站进行parse public class VimeoParser extends SearchSite implements ListHandler, PageHandler { private static Logger logger = Logger.getLogger(VimeoParser.class); private String site = "vimeo.com"; DefaultHttpClient client = new DefaultHttpClient(); public static HeaderGroup headers = new HeaderGroup(); static { headers.addHeader(new BasicHeader("Host", "vimeo.com")); // Cookie","searchtoken=2e09594c } @Override public boolean parsePage(String content, MediaEntry me) { me.site = site; me.views = -1; me.comments = -1; me.favorites = -1; TagNode tag = HtmlCleanerUtil.getInstance().getHtmlTag(content); getTitle(tag, me); getDesc(tag, me); getUrl(tag, me); getIcon(tag, me); getViews(tag, me); // System.out.println(me.views); getAddDate(tag, me); // System.out.println(me.adddate); getDuration(tag, me); getTag(tag, me); getCatagory(tag, me); content = null;// for GC return XinUtil.parsePageReturn(me); } @Override public ArrayList<MediaEntry> parseList(String content) { return HtmlParserUtil.getInstance().parseLinkTag(site, content, "//div[@class='thumbnail_format']//div[@class='title']//a"); } public String getHtmltoCookie(String url) { String ret = ""; HttpGet httpget = new HttpGet(url); httpget.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10"); httpget.addHeader("Host", "vimeo.com"); httpget.addHeader("Referer", url); httpget.addHeader("Accept-Encoding", "gzip,deflate"); httpget.addHeader("Keep-Alive", "115"); httpget.addHeader("Connection", "Keep-Alive"); httpget.addHeader("Cookie", "searchtoken=2e09594c"); httpget.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); ResponseHandler<String> responseHandler = new BasicResponseHandler(); try { ret = client.execute(httpget, responseHandler); } catch (Exception e) { e.printStackTrace(); } return ret; } /** * @param args */ public static void main(String[] args) { boolean result = isNumber("35.3"); System.out.println(result); // System.setProperty("http.proxyHost", "127.0.0.1"); // System.setProperty("http.proxyPort", "8087"); String pageUrl = "https://vimeo.com/106161632";// http://vimeo.com/44649795 // http://vimeo.com/channels/nipwitz/114951438 // http://vimeo.com/114951438 // https://vimeo.com/ondemand/highmaintenance //https://vimeo.com/ondemand/5182 //http://vimeo.com/4749536 // String url = "https://player.vimeo.com/video/114951438"; String content = XinBaseUtil.getUrlString(pageUrl, "UTF-8", new String[][] { { "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0" } }); MediaEntry me = new MediaEntry(); System.out.println(content); me.pageurl = pageUrl; // MediaEntry me=new MediaEntry(); VimeoParser vimeo = new VimeoParser(); // String html = HtmlParserUtil.getHtml(pageUrl, null); // String html=XinBaseUtil.getUrlString(pageUrl,"UTF-8", new // String[][]{{"Host","Vimeo"},{"roxy-Connection","keep-alive"},{"User-Agent","Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/17.0 Firefox/17.0"}}); System.out.println(content.length()); System.out.println("result = "+vimeo.parsePage(content, me)); System.out.println(me); // String // listUrl="http://www.vimeo.com/search/videos/search:star/st/2e09594c"; // String listUrl=vimeo.getListUrl(0, "star", 2); // System.out.println(html); } @Override public String getListUrl(int type, String searchString, int page) { if (page == 0) page = 1; String listurl = "http://vimeo.com/search/videos/search:" + searchString + "/st/2e09594c/page:" + page + "/sort:relevant/format:thumbnail"; return listurl; } private void getDesc(TagNode tag, MediaEntry me) { String script = "//meta[@property='og:description']"; String desc = HtmlCleanerUtil.getInstance().parserAttrByXpath(tag, script, "content"); // String desc = HtmlCleanerUtil.getInstance().parserAttrByXpath(tag, // script, "content"); if (desc != null) me.description = desc; else me.description = ""; } private void getTitle(TagNode tag, MediaEntry me) { // title String title = HtmlCleanerUtil.getInstance().parserAttrByXpath(tag, "//meta[@property='og:title']", "content"); if (title != null && title.contains("vimeo")) { title = title.replace("vimeo", ""); } me.title = title; } private void getCatagory(TagNode tag, MediaEntry me) { String cata = ""; // String regex = "//div[@class='info_row_cat']"; // cata = HtmlCleanerUtil.getInstance().parserTextByTag(tag, regex); // cata = cata.replace("Categories:", "").replace(" ", // "").replace("\r\n", "").replace("\n", "").replace("\r", ""); me.category = cata; } private void getTag(TagNode tag, MediaEntry me) { ArrayList<String> tags = new ArrayList<String>(); // String regex = "//div[@class='info_row info_row_tag']/a"; // Object[] objArr = HtmlCleanerUtil.getInstance().parseObjectByTag(tag, // regex); // if (objArr != null && objArr.length > 0) { // for (Object oo : objArr) { // tags.add(((TagNode) oo).getText().toString()); // } // } me.tags = tags; } private void getDuration(TagNode tag, MediaEntry me) { me.duration = -1; } private void getAddDate(TagNode tag, MediaEntry me) { // duration String adtnReg = "//time"; String adtStr = ""; adtStr = HtmlCleanerUtil.getInstance().parserAttrByXpath(tag, adtnReg, "datetime"); if (StringUtils.trimToNull(adtStr) == null) { return; } adtStr = adtStr.replace(",", "").replace(" ", "").split("T")[0]; if (StringUtils.trimToNull(adtStr) != null) { me.adddate = adtStr + " 00:00:00"; } } private void getViews(TagNode tag, MediaEntry me) { // duration String durationReg = "//span[@class='plays']"; String durationStr = ""; durationStr = HtmlCleanerUtil.getInstance().parserTextByTag(tag, durationReg); if (StringUtils.trimToNull(durationStr) == null) { return; } durationStr = durationStr.replace(" ", "").replace(",", "").replace("Plays", "").replace("plays", ""); int beishu = 1; if (StringUtils.containsIgnoreCase(durationStr, "K")) { beishu = 1000; durationStr = durationStr.replace("K", ""); } try { if (isNumber(durationStr)) { me.views = (int) (Double.parseDouble(durationStr) * 1000); } else { durationStr = durationStr.substring(0, durationReg.length() - 1); try { me.views = Integer.parseInt(durationStr); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } } catch (NumberFormatException e) { logger.error(e); } } private void getIcon(TagNode tag, MediaEntry me) { String icon = HtmlCleanerUtil.getInstance().parserAttrByXpath(tag, "//meta[@property='og:image']", "content"); if (StringUtils.trimToNull(icon) != null) { me.iconurl = icon; } } private void getUrl(TagNode tag, MediaEntry me) { String script = "//meta[@property='og:video:url']"; String videoStr = HtmlCleanerUtil.getInstance().parserAttrByXpath(tag, script, "content"); String videoId = getVideoId(videoStr); if (StringUtils.trimToNull(videoId) == null) { TagNode[] tagnode = HtmlCleanerUtil.getInstance().parseTagNodeArrayByTag(tag, script); for(TagNode t:tagnode){ videoStr = t.getAttributeByName("content"); if(StringUtils.containsIgnoreCase(videoStr, "clip_id")){ videoStr = videoStr.replace("&", "").replace("", "").replace("autoplay=1", ""); break; } } videoId = getVideoId(videoStr); if(!HtmlCleanerUtil.getInstance().isNumber(videoId)){ videoId = ""; } } String request = ""; if(StringUtils.trimToNull(videoId)==null){ request = HtmlCleanerUtil.getInstance().parserAttrByXpath(tag, "//div[@class='player js-player ']", "data-config-url"); request = request.replace("&", "&"); }else{ request = "https://player.vimeo.com/video/" + videoId + "/config"; } String returnS = XinBaseUtil.getUrlString(request, "UTF-8", new String[][] { { "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0" }, { "Referer", "https://vimeo.com/" } }); if (StringUtils.trimToNull(returnS) == null) { return; } try { JSONObject json = new JSONObject(returnS); if (!json.isNull("request")) { JSONObject jsonRequest = json.getJSONObject("request"); if (!jsonRequest.isNull("files")) { JSONObject jsonfile = jsonRequest.getJSONObject("files"); if (!jsonfile.isNull("h264")) { if (!jsonfile.getJSONObject("h264").isNull("hd")) { if (!jsonfile.getJSONObject("h264").getJSONObject("hd").isNull("url")) { me.url = jsonfile.getJSONObject("h264").getJSONObject("hd").getString("url"); } } else if (!jsonfile.getJSONObject("h264").isNull("sd")) { if (!jsonfile.getJSONObject("h264").getJSONObject("sd").isNull("url")) { me.url = jsonfile.getJSONObject("h264").getJSONObject("sd").getString("url"); } } else if (!jsonfile.getJSONObject("h264").isNull("mobile")) { if (!jsonfile.getJSONObject("h264").getJSONObject("mobile").isNull("url")) { me.url = jsonfile.getJSONObject("h264").getJSONObject("mobile").getString("url"); } } } if(!jsonfile.isNull("progressive")){ JSONArray arr = jsonfile.getJSONArray("progressive"); int maxheight = 0; JSONObject o ; int temp; String hq_url =""; for(int i=0;i<arr.length();i++){ o = arr.getJSONObject(i); if(!o.isNull("height")){ temp = o.getInt("height"); if(maxheight<temp){ maxheight = temp; hq_url = o.getString("url"); } } } me.url =hq_url; } } ; } } catch (JSONException e) { logger.error(e); } } private String getVideoId(String videoStr) { if (StringUtils.trimToNull(videoStr) == null) { return ""; } if (videoStr.split("clip_id=").length == 2) { return videoStr.split("clip_id=")[1]; } return ""; } @Override public String getSitename() { return site; } public static boolean isNumber(String str) { if ((str == null) || (str.trim().length() == 0)) { return false; } String regex = "^[0-9.]*$"; Pattern p = Pattern.compile(regex); Matcher m = p.matcher(str); return (m.find()); } ~~~