利用HttpClient及HtmlCleaner实现的一个CSDN博客搜索下载爬虫

package com.bigdata;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import java.io.*;

/**
 * Small crawler that runs a fixed CSDN blog search, walks every result page and
 * stores the text of each listed article as a {@code .txt} file.
 *
 * <p>Files are written to {@code E:/CSDN/<result page number>/<article title>.txt}.
 * All HTTP traffic goes through one shared {@link CloseableHttpClient}; {@link #main}
 * closes it when the crawl ends, so the helper methods must not be called after
 * {@code main} has returned.
 */
public class CSDN {

    /** Root directory under which one sub-directory per result page is created. */
    private static final String OUTPUT_ROOT = "E:/CSDN/";

    /** Charset used to decode responses that declare none, and to encode saved files. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    // Initialised eagerly so DownlodList/DownloadPage also work when called without main().
    private static final HtmlCleaner cleaner = new HtmlCleaner();
    private static final CloseableHttpClient client = HttpClients.createDefault();

    /**
     * Entry point: reads the total number of result pages from the first search page,
     * then crawls every page in order.
     *
     * @param args unused
     * @throws Exception if a request, an XPath evaluation or a file write fails
     */
    public static void main(String[] args) throws Exception {
        try {
            // NOTE(review): the query contains raw non-ASCII characters; percent-encoding
            // the keyword may be required depending on how the client serialises the URI — confirm.
            String soso = "https://so.csdn.net/so/search/s.do?p=1&q=学习&t=blog";
            System.out.println("爬取地址为:" + soso);

            TagNode firstPage = fetchAndParse(soso);
            // The fifth pager button is expected to carry the last page number.
            String PageNumberPath = "//a[@class='btn btn-xs btn-default'][5]/text()";
            int PageNumber = parsePageCount(firstPage.evaluateXPath(PageNumberPath));

            for (int i = 1; i <= PageNumber; i++) {
                String url = "https://so.csdn.net/so/search/s.do?p=" + i + "&q=学习&t=blog&domain=&o=&s=&u=&l=&f=&rbg=0";
                DownlodList(url, i);
            }
        } finally {
            client.close();
        }
    }

    /**
     * Parses one search-result page and downloads every article it links to.
     *
     * @param URL  address of the search-result page
     * @param Page 1-based result page number; also the name of the output sub-directory
     * @throws Exception if the page cannot be fetched or parsed, the output directory
     *                   cannot be created, or an article download fails
     */
    public static void DownlodList(String URL, int Page) throws Exception {
        TagNode listPage = fetchAndParse(URL);
        ensureDirectory(new File(OUTPUT_ROOT + Page));

        // The loop assumes at most 10 entries per result page.
        for (int i = 1; i <= 10; i++) {
            String ListPath = "//dl[@class='search-list J_search']["+i+"]/dd[@class='author-time']/span[@class='link']/a";
            Object[] anchors = listPage.evaluateXPath(ListPath);
            for (Object anchor : anchors) {
                String link = extractLink(anchor);
                if (!link.isEmpty()) {
                    DownloadPage(link, Page);
                }
            }
        }
    }

    /**
     * Downloads a single article and saves its text under the directory of the given result page.
     * Articles whose title or content block cannot be located are skipped with a message
     * on standard error.
     *
     * @param PageURL address of the article
     * @param Page    result page number the article was listed on (selects the output directory)
     * @throws Exception if the article cannot be fetched or parsed, or the file cannot be written
     */
    public static void DownloadPage(String PageURL, int Page) throws Exception {
        TagNode article = fetchAndParse(PageURL);

        String TitlePath = "//h1[@class='title-article']/text()";
        Object[] Title = article.evaluateXPath(TitlePath);
        String Context = "//div[@id='content_views']//text()";
        Object[] Contexts = article.evaluateXPath(Context);

        if (Title.length == 0 || Contexts.length == 0) {
            System.err.println("Skipping " + PageURL + ": title or content not found");
            return;
        }

        File directory = new File(OUTPUT_ROOT + Page);
        ensureDirectory(directory);
        File file = new File(directory, sanitizeFileName(Title[0].toString()) + ".txt");

        // FileOutputStream creates the file when missing; try-with-resources guarantees the close.
        try (FileOutputStream fileOutputStream = new FileOutputStream(file)) {
            IOUtils.write(Contexts[0].toString(), fileOutputStream, DEFAULT_CHARSET);
        }
    }

    /** Fetches {@code url}, decodes the body (UTF-8 unless the server says otherwise) and parses it. */
    private static TagNode fetchAndParse(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        try (CloseableHttpResponse response = client.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            String html = entity == null ? "" : EntityUtils.toString(entity, DEFAULT_CHARSET);
            return cleaner.clean(html);
        }
    }

    /** Extracts the page count from the pager XPath result; falls back to 1 when no digits are found. */
    private static int parsePageCount(Object[] pagerTexts) {
        if (pagerTexts == null || pagerTexts.length == 0) {
            return 1;
        }
        String digits = pagerTexts[0].toString().replaceAll("\\D", "");
        return digits.isEmpty() ? 1 : Integer.parseInt(digits);
    }

    /** Returns the target of an XPath result: the href of an anchor node, else its text form. */
    private static String extractLink(Object node) {
        if (node instanceof TagNode) {
            TagNode anchor = (TagNode) node;
            String href = anchor.getAttributeByName("href");
            if (href != null && !href.trim().isEmpty()) {
                return href.trim();
            }
            // No usable href: fall back to the visible link text.
            return anchor.getText().toString().trim();
        }
        return node == null ? "" : node.toString().trim();
    }

    /** Creates {@code directory} (including missing parents) or fails loudly. */
    private static void ensureDirectory(File directory) throws IOException {
        if (!directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Cannot create output directory: " + directory);
        }
    }

    /** Replaces characters that are illegal in file names so a title can be used as one. */
    private static String sanitizeFileName(String title) {
        String cleaned = title.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        return cleaned.isEmpty() ? "untitled" : cleaned;
    }
}

 

全部评论

相关推荐

最近又搬回宿舍了,在工位坐不住,写一写秋招起伏不断的心态变化,也算对自己心态的一些思考表演式学习从开始为实习准备的时候就特别焦虑,楼主一开始选择的是cpp后端,但是24届这个方向已经炸了,同时自己又因为本科非92且非科班,所以感到机会更加迷茫。在某天晚上用java写出hello world并失眠一整晚后选择老本行干嵌入式。理想是美好的,现实情况是每天忙但又没有实质性进展,总是在配环境,调工具,顺带还要推科研。而这时候才发现自己一直在表演式学习,徘徊在设想如何展开工作的循环里,导致没有实质性进展。现在看来当时如果把精力专注在动手写而不是两只手端着看教程,基本功或许不会那么差。实习的焦虑5月,楼主...
耶比:哲学上有一个问题,玛丽的房间:玛丽知道眼睛识别色彩的原理知道各种颜色,但是她生活在黑白的房间里,直到有一天玛丽的房门打开了她亲眼看到了颜色,才知道什么是色彩。我现在最大可能的减少对非工作事情的思考,如果有一件事困扰了我, 能解决的我就直接做(去哪里或者和谁吵架等等……),解决不了的我就不想了,每一天都是最年轻的一天,珍惜今天吧
投递比亚迪等公司10个岗位 > 秋招被确诊为…… 牛客创作赏金赛
点赞 评论 收藏
分享
评论
点赞
收藏
分享
牛客网
牛客企业服务