/**
 * Scrapes a shopping-tip topic page (for example https://www.hxm5.com/t/2065908).
 *
 * <p>The page is downloaded and the element with id {@code topic-desc} is located. The text of
 * the {@code h1} elements selected beneath it becomes the title, and the child nodes of the
 * {@code p} elements selected beneath it are walked in document order:
 * <ul>
 *   <li>text nodes are appended to the content;</li>
 *   <li>{@code a} elements are requested, and the URL of the request that produced the response
 *       (the final URL when redirects were followed) is appended to the content, while the
 *       unresolved link is added to the hyperlink list;</li>
 *   <li>{@code img} elements contribute their {@code data-original} attribute to both the
 *       content and the image list.</li>
 * </ul>
 *
 * <p>If the system property {@code system.net.proxy} is set, it is parsed as {@code host:port}
 * and used as an HTTP proxy for every request made here.
 *
 * <p>Failures are logged, never thrown. On failure the tuple holds whatever was collected so
 * far, so the title may be {@code null} and the content and lists may be empty.
 *
 * @param url address of the topic page
 * @return a quartet of title, content, image URLs and hyperlinks, in that order
 */
public static Quartet<String, String, List<String>, List<String>> getShoppingTipInfo(String url) {
    String title = null;
    StringBuilder content = new StringBuilder();
    List<String> imageUrl = new ArrayList<>();
    List<String> hrefUrl = new ArrayList<>();
    try {
        URI host = URLUtil.getHost(new URL(url));
        Request.Builder request = getRequestBuilder(url);
        request.removeHeader("Referer").removeHeader("Host");

        OkHttpClient.Builder okHttpClientBuilder = new OkHttpClient.Builder()
                .connectTimeout(10, TimeUnit.SECONDS)
                .readTimeout(10, TimeUnit.SECONDS)
                .writeTimeout(10, TimeUnit.SECONDS);
        // Optional proxy, configured as "host:port". A malformed value fails the whole call
        // (logged below) rather than silently bypassing the proxy.
        String proxySetting = System.getProperty("system.net.proxy");
        if (proxySetting != null) {
            String[] proxyAddr = proxySetting.split(":");
            okHttpClientBuilder.proxy(new Proxy(Proxy.Type.HTTP,
                    new InetSocketAddress(proxyAddr[0], Integer.parseInt(proxyAddr[1]))));
        }
        OkHttpClient okHttpClient = okHttpClientBuilder.build();

        String pageHtml = fetchBody(okHttpClient, request.build());
        Document jsoupDocument = getJsoupDocument(pageHtml);
        Element topic = jsoupDocument.body().getElementById("topic-desc");
        if (topic == null) {
            // Report an unexpected page layout explicitly instead of via a bare NPE.
            log.error("No element with id 'topic-desc' found at " + url);
        } else {
            title = topic.children().select("h1").text();
            collectParagraphs(topic, host, okHttpClient, content, imageUrl, hrefUrl);
        }
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is not lost.
        log.error("Failed to scrape shopping tip from " + url, e);
    }
    return Quartet.with(title, content.toString(), imageUrl, hrefUrl);
}

/**
 * Executes the request and returns the response body as text, closing the response afterwards.
 */
private static String fetchBody(OkHttpClient client, Request request) throws java.io.IOException {
    try (okhttp3.Response response = client.newCall(request).execute()) {
        okhttp3.ResponseBody body = response.body();
        if (body == null) {
            throw new java.io.IOException("Empty response body for " + request.url());
        }
        // string() decodes with the charset declared by the response (UTF-8 when none is
        // declared) instead of the platform default charset.
        return body.string();
    }
}

/**
 * Walks the child nodes of the {@code p} elements selected beneath {@code topic} and appends
 * text, links and images to the given accumulators.
 */
private static void collectParagraphs(Element topic, URI host, OkHttpClient client,
        StringBuilder content, List<String> imageUrl, List<String> hrefUrl) {
    Elements paragraphs = topic.children().select("p");
    for (Element paragraph : paragraphs) {
        for (Node child : paragraph.childNodes()) {
            if (child instanceof TextNode) {
                content.append(((TextNode) child).text());
                // A line break is added unless the preceding sibling is also a text node.
                if (!(child.previousSibling() instanceof TextNode)) {
                    content.append(StringUtils.LF);
                }
            } else if (child instanceof Element) {
                Element element = (Element) child;
                if (element.is("a")) {
                    String href = toAbsoluteUrl(host, element.attr("href"));
                    content.append(resolveLink(client, href));
                    hrefUrl.add(href);
                    content.append(StringUtils.LF);
                }
                if (element.is("img")) {
                    String imageHref = element.attr("data-original");
                    content.append(imageHref);
                    imageUrl.add(imageHref);
                    content.append(StringUtils.LF);
                }
            }
        }
    }
}

/**
 * Requests {@code href} and returns the URL of the request that produced the response; falls
 * back to {@code href} itself when the link cannot be requested.
 */
private static String resolveLink(OkHttpClient client, String href) {
    try {
        Request.Builder linkRequest = getRequestBuilder(href);
        linkRequest.removeHeader("Host");
        // try-with-resources releases the connection; only the final URL is needed.
        try (okhttp3.Response response = client.newCall(linkRequest.build()).execute()) {
            return response.request().url().toString();
        }
    } catch (Exception e) {
        // Best effort: one dead or malformed link must not discard the rest of the page.
        log.warn("Could not resolve link " + href, e);
        return href;
    }
}

/**
 * Prefixes {@code href} with {@code host} unless it is already an absolute http(s) URL.
 */
private static String toAbsoluteUrl(URI host, String href) {
    if (href.startsWith("http://") || href.startsWith("https://")) {
        return href;
    }
    return host + href;
}