用 Java 写爬虫:轻松刷 CSDN 博客访问量与搜索排名优化
1.导入依赖
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version> <!-- or latest version -->
</dependency>

2. 获取代理
大部分网站对同一 IP 的访问频率进行了限制,可以去注册一个 IP 代理账户(免费 5000 个 IP)。
《JAVA爬虫刷CSDN网页浏览量热度和搜索排名获取代理IP》
/**
 * Proxy-IP extraction helper: each call fetches one proxy address
 * (expected format: a single "ip:port" string) from the provider's
 * extraction endpoint.
 */
public class ProxyIP {

    /**
     * Fetches a single proxy address from the provider.
     *
     * @return the response body text, trimmed of surrounding whitespace,
     *         expected to be one "ip:port" string
     * @throws IOException if the HTTP request to the provider fails
     */
    public static String getOne() throws IOException {
        // Placeholder — fill in the provider's extraction URL (one IP per call).
        String proxyUrl =
                "填入代理IP的提取链接 一次获取一个 ";
        // timeout(): fail fast instead of hanging the caller forever.
        // trim(): providers often append a trailing newline, which would
        // break the caller's later "ip:port" split.
        String ip = Jsoup.connect(proxyUrl)
                .timeout(10_000)
                .get()
                .body()
                .text()
                .trim();
        return ip;
    }
}

3. 爬取页面信息
以 CSDN 为例:按 F12 打开开发者工具,找到目标元素所处的位置。


4.使用 Jsoup 爬取内容
package com.sgg.main;

import com.sgg.main.proxy.ProxyIP;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.Proxy;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
/**
 * Scrapes the blog index page for every article's title and URL, then runs
 * one worker thread per article that repeatedly visits the article page
 * (and a search-engine result page) through rotating proxy IPs.
 *
 * NOTE(review): the index URL and search URL below are placeholders
 * ("https://xxxxxxxxxx") and must be filled in before the program can run.
 */
public class Index {

    public static void main(String[] args) throws IOException {
        // Article URLs discovered on the index page.
        List<String> urls = new ArrayList<>();
        // Article titles, parallel to urls (same index = same article).
        List<String> names = new ArrayList<>();

        // Blog home page (placeholder — fill in before running).
        String indexUrl = "https://xxxxxxxxxx";

        // Visit the home page and scrape each <article>'s title and link.
        Document document = Jsoup.connect(indexUrl).get();
        Elements articles = document.getElementsByTag("article");
        for (Element element : articles) {
            Node node = element.childNode(0);
            // NOTE(review): this child-node walk is tied to CSDN's exact DOM
            // layout at the time of writing; a CSS selector such as
            // element.select("a") would survive markup changes better.
            Node parentNode = node.childNode(0).childNode(0).parentNode();
            names.add(parentNode.childNode(0).childNode(0).toString());
            urls.add(node.attributes().get("href"));
        }

        // Guard: Executors.newFixedThreadPool(0) throws IllegalArgumentException.
        if (urls.isEmpty()) {
            System.out.println("未找到任何博文链接");
            return;
        }

        // One dedicated worker thread per article URL.
        // NOTE(review): the workers loop until interrupted, so the pool is
        // intentionally never shut down; stop the process to stop the crawl.
        ExecutorService executorService = Executors.newFixedThreadPool(urls.size());
        for (int i = 0; i < urls.size(); i++) {
            final int finalI = i;
            executorService.execute(() -> {
                while (!Thread.currentThread().isInterrupted()) {
                    // Fetch a fresh proxy for every round.
                    String[] split;
                    try {
                        split = ProxyIP.getOne().split(":");
                    } catch (IOException e) {
                        // Original code continued with a null array here and
                        // crashed with an NPE; skip this round and retry instead.
                        System.out.println("获取代理失败: " + e.getMessage());
                        continue;
                    }
                    // A malformed proxy string would otherwise throw
                    // ArrayIndexOutOfBoundsException and kill the thread.
                    if (split.length < 2) {
                        continue;
                    }
                    try {
                        // Query a search engine for the post title to build
                        // search-ranking "heat".
                        String encoded =
                                URLEncoder.encode(names.get(finalI), StandardCharsets.UTF_8.name());
                        // Search URL (placeholder — should embed `encoded`;
                        // the original never used the encoded title at all).
                        String searchUrl =
                                "https:xxxxxxxxxxxx";
                        Connection.Response search = Jsoup.connect(searchUrl)
                                .proxy(split[0], Integer.parseInt(split[1]))
                                .ignoreContentType(true)
                                .execute();
                        // Then visit the article itself through the same proxy.
                        Connection.Response response = Jsoup.connect(urls.get(finalI))
                                .proxy(split[0], Integer.parseInt(split[1]))
                                .ignoreContentType(true)
                                .execute();
                        System.out.println(names.get(finalI).substring(0, 5)
                                + " 搜索状态: " + search.statusCode()
                                + " ---> " + urls.get(finalI).split("details/")[1]
                                + " 访问状态 " + response.statusCode());
                        // Randomized pause (30-39 s) so requests do not look
                        // machine-timed; ThreadLocalRandom avoids allocating a
                        // new Random on every iteration.
                        TimeUnit.SECONDS.sleep(30 + ThreadLocalRandom.current().nextInt(10));
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag so the loop condition
                        // sees it and the worker exits cleanly.
                        Thread.currentThread().interrupt();
                    } catch (IOException e) {
                        System.out.println(names.get(finalI).substring(0, 5) + " "
                                + urls.get(finalI).split("details/")[1] + " 访问出错了");
                        System.out.println(e.getMessage());
                    }
                }
            });
        }
    }
}
#Java开发##秋招##Java##学习路径#
查看14道真题和解析
老板电器公司氛围 197人发布