// 利用HttpClient及HtmlCleaner实现的一个CSDN博客搜索下载爬虫
// (A CSDN blog search-and-download crawler built on Apache HttpClient and HtmlCleaner.)
package com.bigdata;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import java.io.*;
/**
 * A CSDN blog search crawler: pages through the search results for a fixed
 * query, extracts every article link, and saves each article's text to
 * {@code E:/CSDN/<page>/<title>.txt}.
 *
 * <p>NOTE(review): the target directory and query are hard-coded; the XPath
 * expressions are tied to CSDN's page layout at the time of writing and will
 * silently match nothing if the site changes.
 */
public class CSDN {

    // Shared parser and HTTP client; both initialized once in main().
    private static HtmlCleaner cleaner;
    private static CloseableHttpClient client;

    public static void main(String[] args) throws Exception {
        client = HttpClients.createDefault();
        cleaner = new HtmlCleaner();
        try {
            String soso = "https://so.csdn.net/so/search/s.do?p=1&q=学习&t=blog";
            System.out.println("爬取地址为:" + soso);

            TagNode root = cleaner.clean(fetch(soso));
            // Assumes the 5th pager button's text contains the last page
            // number — TODO confirm against the live page structure.
            String pageNumberPath = "//a[@class='btn btn-xs btn-default'][5]/text()";
            Object[] nodes = root.evaluateXPath(pageNumberPath);
            if (nodes.length == 0) {
                // Page layout changed (or no results): fail loudly instead of AIOOBE.
                System.err.println("Could not locate the page-count element; aborting.");
                return;
            }
            int pageNumber = Integer.parseInt(nodes[0].toString().replaceAll("\\D", ""));
            for (int i = 1; i <= pageNumber; i++) {
                String url = "https://so.csdn.net/so/search/s.do?p=" + i + "&q=学习&t=blog&domain=&o=&s=&u=&l=&f=&rbg=0";
                DownlodList(url, i);
            }
        } finally {
            client.close();
        }
    }

    /**
     * GETs {@code url} and returns the response body decoded as UTF-8,
     * closing the response (and releasing the connection) in all cases.
     */
    private static String fetch(String url) throws Exception {
        try (CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            // Explicit charset: the platform default may not be UTF-8.
            return EntityUtils.toString(entity, "UTF-8");
        }
    }

    /**
     * Parses one search-result page and downloads every article linked from
     * its (up to 10) result entries.
     *
     * @param URL  search-result page to parse
     * @param Page 1-based page index, used as the output sub-directory name
     */
    public static void DownlodList(String URL, int Page) throws Exception {
        TagNode root = cleaner.clean(fetch(URL));
        // Create the per-page output directory once, not per result entry.
        File dir = new File("E:/CSDN/" + Page);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        for (int i = 1; i <= 10; i++) {
            String listPath = "//dl[@class='search-list J_search'][" + i + "]/dd[@class='author-time']/span[@class='link']/a";
            for (Object link : root.evaluateXPath(listPath)) {
                DownloadPage(link.toString(), Page);
            }
        }
    }

    /**
     * Downloads one article: extracts its title and body text and writes the
     * body to {@code E:/CSDN/<Page>/<title>.txt}.
     *
     * @param PageURL article URL
     * @param Page    1-based search page index (output sub-directory)
     */
    public static void DownloadPage(String PageURL, int Page) throws Exception {
        TagNode root = cleaner.clean(fetch(PageURL));
        Object[] title = root.evaluateXPath("//h1[@class='title-article']/text()");
        Object[] contents = root.evaluateXPath("//div[@id='content_views']//text()");
        if (title.length == 0 || contents.length == 0) {
            // Not a standard article page (e.g. a video/ask link); skip it.
            return;
        }
        // Replace characters Windows forbids in file names so the write can't fail.
        String safeTitle = title[0].toString().replaceAll("[\\\\/:*?\"<>|]", "_");
        File out = new File("E:/CSDN/" + Page + "/" + safeTitle + ".txt");
        try (FileOutputStream fileOutputStream = new FileOutputStream(out)) {
            fileOutputStream.write(contents[0].toString().getBytes("UTF-8"));
        }
    }
}