Java 代码
/**
 * Scrapes a shopping-tip post such as https://www.hxm5.com/t/2065908.
 *
 * <p>The page is fetched and parsed, then the element with id {@code topic-desc} is
 * located. The text of the {@code h1} elements under it becomes the title. The child
 * nodes of every {@code p} element under it are walked in document order:
 * <ul>
 *   <li>text nodes are appended to the content;</li>
 *   <li>{@code a} elements are requested through the OkHttp client and the URL of the
 *       request that produced the response is appended to the content, while the
 *       host-prefixed {@code href} is recorded in the hyperlink list;</li>
 *   <li>{@code img} elements contribute their {@code data-original} attribute to both
 *       the content and the image list.</li>
 * </ul>
 *
 * <p>This method does not throw: any exception is logged and whatever was collected up
 * to that point is returned. If the {@code topic-desc} element is missing, the title is
 * {@code null} and the other components are empty.
 *
 * @param url address of the post page to scrape
 * @return a {@code Quartet} of (title, content, image URLs, hyperlinks)
 */
public static Quartet<String, String, List<String>, List<String>> getShoppingTipInfo(String url) {
    String title = null;
    StringBuilder content = new StringBuilder();
    List<String> imageUrl = Lists.newArrayList();
    List<String> hrefUrl = Lists.newArrayList();
    try {
        URI host = URLUtil.getHost(new URL(url));
        Document jsoupDocument = getJsoupDocument(HttpUtil.get(url));
        Element topicDesc = jsoupDocument.body().getElementById("topic-desc");
        if (topicDesc == null) {
            // getElementById returns null when nothing matches; report it explicitly
            // instead of relying on a NullPointerException further down.
            log.warn("No element with id topic-desc found at " + url);
        } else {
            title = topicDesc.children().select("h1").text();
            // The post body lives in <p> elements.
            Elements paragraphs = topicDesc.children().select("p");
            for (Element paragraph : paragraphs) {
                for (Node pChild : paragraph.childNodes()) {
                    if (pChild instanceof TextNode) {
                        content.append(((TextNode) pChild).text());
                        // Break the line unless this text directly follows another text node.
                        Node preNode = pChild.previousSibling();
                        if (!(preNode instanceof TextNode)) {
                            content.append(StringUtils.LF);
                        }
                    } else if (pChild instanceof Element) {
                        Element child = (Element) pChild;
                        if (child.is("a")) { // hyperlink
                            String href = host + child.attr("href");
                            Request.Builder request = getRequestBuilder(href);
                            // NOTE(review): presumably dropped so the client derives Host
                            // from the target URL - confirm against getRequestBuilder.
                            request.removeHeader("Host");
                            String directUrl;
                            // The response holds a connection; close it once the URL is read.
                            try (okhttp3.Response response = getOkHttp().newCall(request.build()).execute()) {
                                // URL of the request that produced this response; it differs
                                // from href when the client followed redirects.
                                directUrl = response.request().url().toString();
                            }
                            content.append(directUrl);
                            hrefUrl.add(href);
                            content.append(StringUtils.LF);
                        }
                        if (child.is("img")) { // image
                            String imageHref = child.attr("data-original");
                            content.append(imageHref);
                            imageUrl.add(imageHref);
                            content.append(StringUtils.LF);
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        // Best-effort scrape: keep what was collected, but log the exception itself so
        // the stack trace survives (getMessage() is null for e.g. NullPointerException).
        log.error("Failed to scrape shopping tip from " + url, e);
    }
    return Quartet.with(title, content.toString(), imageUrl, hrefUrl);
}
Maven 依赖
<!-- javatuples: supplies the Quartet tuple type returned by getShoppingTipInfo. -->
<dependency>
<groupId>org.javatuples</groupId>
<artifactId>javatuples</artifactId>
<version>1.2</version>
</dependency>
<!-- jsoup: supplies the HTML parsing types (Document, Element, Elements, Node, TextNode). -->
<!-- NOTE(review): getShoppingTipInfo also references URLUtil/HttpUtil, Lists, StringUtils -->
<!-- and an OkHttp client; their dependencies are not listed in this snippet. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>