Writing a Java Crawler to Easily Boost CSDN Blog Page Views and Optimize Search Ranking
1. Import the dependency
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version> <!-- or the latest version -->
</dependency>
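Before going further, it is worth a one-off smoke test to confirm the dependency resolves and Jsoup can fetch a page. A minimal sketch (the class name and target URL here are just placeholders, not part of the crawler):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupSmokeTest {
    public static void main(String[] args) throws Exception {
        // Fetch any reachable page and print its <title> to verify the setup
        Document doc = Jsoup.connect("https://www.csdn.net").get();
        System.out.println(doc.title());
    }
}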
2. Obtain proxy IPs
Most websites throttle repeated visits from the same IP, so you need rotating proxies. [You can register an account with an IP proxy service; some offer 5,000 free IPs.]
See the companion post 《JAVA爬虫刷CSDN网页浏览量热度和搜索排名获取代理IP》 for details on obtaining the proxy IPs.
import org.jsoup.Jsoup;

import java.io.IOException;

public class ProxyIP {
    // Fetches one proxy address from the provider's extraction endpoint
    public static String getOne() throws IOException {
        // Fill in your provider's extraction URL; it should return a single "host:port" per request
        String proxyUrl = "proxy-IP extraction URL here (one IP per request)";
        String ip = Jsoup.connect(proxyUrl).get().body().text().trim();
        return ip;
    }
}
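A quick usage sketch, assuming the provider returns a single host:port pair as plain text (which is what getOne() above expects); ProxyDemo and the target URL are throwaway placeholders:

import org.jsoup.Connection;
import org.jsoup.Jsoup;

public class ProxyDemo {
    public static void main(String[] args) throws Exception {
        // Split the "host:port" string returned by the provider
        String[] hostPort = ProxyIP.getOne().split(":");
        // Route a request through the fetched proxy and check the status code
        Connection.Response res = Jsoup.connect("https://www.csdn.net")
                .proxy(hostPort[0], Integer.parseInt(hostPort[1]))
                .ignoreContentType(true)
                .execute();
        System.out.println("Status via proxy: " + res.statusCode());
    }
}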
3. Crawl the page information
Taking CSDN as an example, press F12 to open the developer tools and locate the elements you need (the post links and titles); the selectors can be tested in isolation, as shown below.
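A small sketch for verifying the selectors before wiring them into the full crawler. It assumes the homepage markup wraps each post in an <article> tag whose links point at .../details/... pages, matching the structure walked in step 4; adjust the selector and the placeholder URL to whatever F12 actually shows for your page:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorTest {
    public static void main(String[] args) throws Exception {
        // Replace with your own blog homepage URL
        Document doc = Jsoup.connect("https://blog.csdn.net/yourname").get();
        // Each post sits in an <article>; its links hold the post href and title text
        for (Element a : doc.select("article a[href*=details]")) {
            System.out.println(a.attr("href") + " -> " + a.text());
        }
    }
}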
4. Crawl the content with Jsoup
package com.sgg.main;

import com.sgg.main.proxy.ProxyIP;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class Index {
    public static void main(String[] args) throws IOException {
        // Post URLs
        ArrayList<String> urls = new ArrayList<>();
        // Post titles
        ArrayList<String> names = new ArrayList<>();
        // Blog homepage (fill in your own)
        String indexUrl = "https://xxxxxxxxxx";

        // Visit the blog homepage and scrape each post's URL and title
        Document document = Jsoup.connect(indexUrl).get();
        Elements article = document.getElementsByTag("article");
        for (Element element : article) {
            Node node = element.childNode(0);
            Node parentNode = node.childNode(0).childNode(0).parentNode();
            // Scrape the post title
            String name = parentNode.childNode(0).childNode(0).toString();
            names.add(name);
            // Scrape the post URL
            Attributes attributes = node.attributes();
            urls.add(attributes.get("href"));
        }

        // Start the threads: one worker per post URL
        ExecutorService executorService = Executors.newFixedThreadPool(urls.size());
        for (int i = 0; i < urls.size(); i++) {
            int finalI = i;
            executorService.execute(() -> {
                while (true) {
                    // Fetch a fresh proxy; skip this round if the provider fails
                    String[] proxy;
                    try {
                        proxy = ProxyIP.getOne().split(":");
                    } catch (IOException e) {
                        e.printStackTrace();
                        continue;
                    }
                    try {
                        // Search for the post title on a search engine to boost its search ranking
                        String encodedName = URLEncoder.encode(names.get(finalI), "utf-8");
                        // Search URL: fill in the search engine's query URL; the encoded title is appended
                        String searchUrl = "https:xxxxxxxxxxxx" + encodedName;
                        Connection.Response search = Jsoup.connect(searchUrl)
                                .proxy(proxy[0], Integer.parseInt(proxy[1]))
                                .ignoreContentType(true)
                                .execute();
                        // Visit the post itself through the same proxy
                        Connection.Response response = Jsoup.connect(urls.get(finalI))
                                .proxy(proxy[0], Integer.parseInt(proxy[1]))
                                .ignoreContentType(true)
                                .execute();
                        System.out.println(names.get(finalI).substring(0, 5)
                                + " search status: " + search.statusCode()
                                + " ---> " + urls.get(finalI).split("details/")[1]
                                + " visit status " + response.statusCode());
                        // Random 30-40s pause so the traffic looks less uniform
                        TimeUnit.SECONDS.sleep(30 + new Random().nextInt(10));
                    } catch (IOException e) {
                        System.out.println(names.get(finalI).substring(0, 5) + " "
                                + urls.get(finalI).split("details/")[1] + " request failed");
                        System.out.println(e.getMessage());
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag and stop this worker
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            });
        }
    }
}
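Two design notes on the code above: every post gets its own worker thread that fetches a fresh proxy each round, so consecutive hits arrive from different IPs, and the randomized 30-40 second sleep keeps the request rhythm from looking machine-generated. The while (true) loop never exits, so the process has to be stopped manually; for a bounded run, replace it with a counted loop and call executorService.shutdown() once the workers return.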