利用开源插件html-unit
https://github.com/xautlx/nutch-htmlunit
把插件导入到nutch环境中
但是在执行过程中,会出现各种错误。原因是lib-htmlunit的HttpWebClient有问题,
作了如下修改:
package org.apache.nutch.protocol.htmlunit;
import org.apache.hadoop.conf.Configuration;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import java.net.URL;import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;import com.gargoylesoftware.htmlunit.WebClient;import com.gargoylesoftware.htmlunit.html.HtmlPage;import com.gargoylesoftware.htmlunit.html.HtmlInput;import com.gargoylesoftware.htmlunit.WebRequest;import com.gargoylesoftware.htmlunit.AjaxController;import com.gargoylesoftware.htmlunit.BrowserVersion;/** * Htmlunit WebClient Helper * Use one WebClient instance per thread by ThreadLocal to support multiple threads execution */public class HttpWebClient { private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol"); private static ThreadLocal<WebClient> threadWebClient = new ThreadLocal<WebClient>(); public static HtmlPage getHtmlPage(String url, Configuration conf) { try { WebClient webClient = threadWebClient.get(); if (webClient == null) { LOG.info("Initing web client for thread: {}", Thread.currentThread().getId()); AjaxController ajaxController = new NicelyResynchronizingAjaxController(); webClient = new WebClient(BrowserVersion.FIREFOX_17); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setJavaScriptEnabled(true); webClient.setAjaxController(ajaxController); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setPrintContentOnFailingStatusCode(false); webClient.getOptions().setRedirectEnabled(true); webClient.getOptions().setPopupBlockerEnabled(true); webClient.setCache(new ExtHtmlunitCache()); // Enhanced WebConnection based on urlfilter//百度云盘基本都是Ajax实现的,提供了账号密码方式
HtmlPage loginPage = webClient.getPage("http://yun.baidu.com");
loginPage.getElementById("TANGRAM__PSP_4__userName").setAttribute("value","280889189"); loginPage.getElementById("TANGRAM__PSP_4__password").setAttribute("value","123578951"); loginPage = ((HtmlInput)loginPage.getElementById("TANGRAM__PSP_4__submit")).click(); webClient.setWebConnection(new RegexHttpWebConnection(webClient,conf)); threadWebClient.set(webClient); } HtmlPage page = webClient.getPage(url);// webClient.closeAllWindows(); return page; } catch (Exception e) { throw new RuntimeException(e); } } public static HtmlPage getHtmlPage(String url) { return getHtmlPage(url, null); }}