线报 URL 内容获取

wangfei 2022-12-14 14:24:30 3330

 /**
     * Scrapes a shopping-tip topic page (for example https://www.hxm5.com/t/2065908).
     *
     * <p>The page is downloaded, the element with id {@code topic-desc} is located, and its
     * {@code <h1>} text becomes the title. Every {@code <p>} beneath it is then walked: text nodes
     * are appended to the content, {@code <a>} elements are replaced by the URL the HTTP client
     * ends up at when requesting them, and {@code <img>} elements contribute the value of their
     * {@code data-original} attribute.
     *
     * <p>This method never throws. On failure the error is logged and whatever was collected up
     * to that point is returned (the title may be {@code null}, the content may be empty).
     *
     * @param url address of the topic page to scrape
     * @return a quartet of (title, content, image URLs, hyperlink URLs)
     */
    public static Quartet<String, String, List<String>, List<String>> getShoppingTipInfo(String url) {
        String title = null;
        StringBuilder content = new StringBuilder();
        List<String> imageUrl = Lists.newArrayList();
        List<String> hrefUrl = Lists.newArrayList();
        try {
            // Presumably scheme + host of the page; used as the prefix for site-relative links.
            URI host = URLUtil.getHost(new URL(url));
            Request.Builder request = getRequestBuilder(url);
            request.removeHeader("Referer").removeHeader("Host");
            OkHttpClient okHttpClient = buildShoppingTipHttpClient();

            // string() decodes with the charset from Content-Type (UTF-8 when absent) and closes
            // the body, instead of relying on the platform default charset.
            String html = okHttpClient.newCall(request.build()).execute().body().string();
            Document jsoupDocument = getJsoupDocument(html);
            Element topicDesc = jsoupDocument.body().getElementById("topic-desc");
            if (topicDesc == null) {
                log.warn("No topic-desc element found at " + url);
                return Quartet.with(title, content.toString(), imageUrl, hrefUrl);
            }
            title = topicDesc.children().select("h1").text();

            // The topic content lives in <p> elements.
            Elements paragraphs = topicDesc.children().select("p");
            for (Element paragraph : paragraphs) {
                for (Node pChild : paragraph.childNodes()) {
                    if (pChild instanceof TextNode) {
                        content.append(((TextNode) pChild).text());
                        // Break the line unless this text directly follows another text node.
                        if (!(pChild.previousSibling() instanceof TextNode)) {
                            content.append(StringUtils.LF);
                        }
                    } else if (pChild instanceof Element) {
                        Element child = (Element) pChild;
                        if (child.is("a")) {
                            String rawHref = child.attr("href");
                            // Only prefix the host for relative links; absolute ones are used as-is.
                            boolean absolute = rawHref.startsWith("http://") || rawHref.startsWith("https://");
                            String href = absolute ? rawHref : host + rawHref;
                            content.append(resolveShoppingTipLink(okHttpClient, href));
                            hrefUrl.add(href);
                            content.append(StringUtils.LF);
                        }
                        if (child.is("img")) {
                            String imageHref = child.attr("data-original");
                            content.append(imageHref);
                            imageUrl.add(imageHref);
                            content.append(StringUtils.LF);
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Keep the best-effort contract, but log the cause and stack trace, not just the message.
            log.error("Failed to fetch shopping tip from " + url, e);
        }
        return Quartet.with(title, content.toString(), imageUrl, hrefUrl);
    }

    /**
     * Builds the HTTP client with 10-second timeouts, routed through the HTTP proxy given by the
     * {@code system.net.proxy} system property ({@code host:port}) when that property is set.
     */
    private static OkHttpClient buildShoppingTipHttpClient() {
        OkHttpClient.Builder builder = new OkHttpClient.Builder()
                .connectTimeout(10, TimeUnit.SECONDS)
                .readTimeout(10, TimeUnit.SECONDS)
                .writeTimeout(10, TimeUnit.SECONDS);
        String property = System.getProperty("system.net.proxy");
        if (property != null) {
            String[] proxyAddr = property.split(":");
            if (proxyAddr.length != 2) {
                throw new IllegalArgumentException("system.net.proxy must be host:port, got: " + property);
            }
            builder.proxy(new Proxy(Proxy.Type.HTTP,
                    new InetSocketAddress(proxyAddr[0], Integer.parseInt(proxyAddr[1]))));
        }
        return builder.build();
    }

    /**
     * Requests {@code href} and returns the URL of the request that produced the final response.
     * The response is always closed; if the lookup fails, {@code href} itself is returned so that
     * one broken link does not abort the whole scrape.
     */
    private static String resolveShoppingTipLink(OkHttpClient okHttpClient, String href) {
        try {
            Request.Builder request = getRequestBuilder(href);
            request.removeHeader("Host");
            try (okhttp3.Response response = okHttpClient.newCall(request.build()).execute()) {
                return response.request().url().toString();
            }
        } catch (Exception e) {
            log.error("Failed to resolve link " + href, e);
            return href;
        }
    }
这家伙太懒了,什么也没留下。

社区声明 1、本站提供的一切软件、教程和内容信息仅限用于学习和研究目的
2、本站资源为用户分享,如有侵权请邮件与我们联系处理敬请谅解!
3、本站信息来自网络,版权争议与本站无关。您必须在下载后的24小时之内,从您的电脑或手机中彻底删除上述内容
最新回复 (1)

您可以在 登录 or 注册 后,对此帖发表评论!

返回