// 利用HttpClient及HtmlCleaner实现的一个CSDN博客搜索下载爬虫
// (A CSDN blog search-and-download crawler built on Apache HttpClient and HtmlCleaner.)
package com.bigdata;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import java.io.*;
/**
 * A CSDN blog search crawler: pages through the search results for a fixed
 * query, extracts every article link, and saves each article's text to
 * {@code E:/CSDN/<page>/<title>.txt}.
 *
 * <p>NOTE(review): the target directory and query are hard-coded; the XPath
 * expressions are tied to CSDN's page layout at the time of writing and will
 * silently match nothing if the site changes.
 */
public class CSDN {

    // Shared parser and HTTP client; both initialized once in main().
    private static HtmlCleaner cleaner;
    private static CloseableHttpClient client;

    public static void main(String[] args) throws Exception {
        client = HttpClients.createDefault();
        cleaner = new HtmlCleaner();
        try {
            String soso = "https://so.csdn.net/so/search/s.do?p=1&q=学习&t=blog";
            System.out.println("爬取地址为:" + soso);

            TagNode root = cleaner.clean(fetch(soso));
            // Assumes the 5th pager button's text contains the last page
            // number — TODO confirm against the live page structure.
            String pageNumberPath = "//a[@class='btn btn-xs btn-default'][5]/text()";
            Object[] nodes = root.evaluateXPath(pageNumberPath);
            if (nodes.length == 0) {
                // Page layout changed (or no results): fail loudly instead of AIOOBE.
                System.err.println("Could not locate the page-count element; aborting.");
                return;
            }
            int pageNumber = Integer.parseInt(nodes[0].toString().replaceAll("\\D", ""));
            for (int i = 1; i <= pageNumber; i++) {
                String url = "https://so.csdn.net/so/search/s.do?p=" + i + "&q=学习&t=blog&domain=&o=&s=&u=&l=&f=&rbg=0";
                DownlodList(url, i);
            }
        } finally {
            client.close();
        }
    }

    /**
     * GETs {@code url} and returns the response body decoded as UTF-8,
     * closing the response (and releasing the connection) in all cases.
     */
    private static String fetch(String url) throws Exception {
        try (CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            // Explicit charset: the platform default may not be UTF-8.
            return EntityUtils.toString(entity, "UTF-8");
        }
    }

    /**
     * Parses one search-result page and downloads every article linked from
     * its (up to 10) result entries.
     *
     * @param URL  search-result page to parse
     * @param Page 1-based page index, used as the output sub-directory name
     */
    public static void DownlodList(String URL, int Page) throws Exception {
        TagNode root = cleaner.clean(fetch(URL));
        // Create the per-page output directory once, not per result entry.
        File dir = new File("E:/CSDN/" + Page);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        for (int i = 1; i <= 10; i++) {
            String listPath = "//dl[@class='search-list J_search'][" + i + "]/dd[@class='author-time']/span[@class='link']/a";
            for (Object link : root.evaluateXPath(listPath)) {
                DownloadPage(link.toString(), Page);
            }
        }
    }

    /**
     * Downloads one article: extracts its title and body text and writes the
     * body to {@code E:/CSDN/<Page>/<title>.txt}.
     *
     * @param PageURL article URL
     * @param Page    1-based search page index (output sub-directory)
     */
    public static void DownloadPage(String PageURL, int Page) throws Exception {
        TagNode root = cleaner.clean(fetch(PageURL));
        Object[] title = root.evaluateXPath("//h1[@class='title-article']/text()");
        Object[] contents = root.evaluateXPath("//div[@id='content_views']//text()");
        if (title.length == 0 || contents.length == 0) {
            // Not a standard article page (e.g. a video/ask link); skip it.
            return;
        }
        // Replace characters Windows forbids in file names so the write can't fail.
        String safeTitle = title[0].toString().replaceAll("[\\\\/:*?\"<>|]", "_");
        File out = new File("E:/CSDN/" + Page + "/" + safeTitle + ".txt");
        try (FileOutputStream fileOutputStream = new FileOutputStream(out)) {
            fileOutputStream.write(contents[0].toString().getBytes("UTF-8"));
        }
    }
}