JSoup的简单使用
要解析的网页地址:http://202.194.143.19/asord/asord_hist.php?page=1
要解析的网页表格,一共86页
URL后面的page参数表示要请求的页码(从1开始);总页数无法自动获取,这里采取手动指定的方式,目前共86页
并且解析完成后,存入数据库
数据表对应的实体类
package com.leo;
import java.util.Date;
/**
 * Mutable bean describing one row of the book purchase-recommendation table.
 *
 * <p>All properties are plain read/write fields; no validation or copying is
 * performed on any of them.
 */
public class Book {

    /** Record identifier; stays {@code null} unless set explicitly. */
    private Integer id;

    /** Title. */
    private String name;

    /** Responsible party (author). */
    private String author;

    /** Publication information. */
    private String press;

    /** Date the purchase was recommended. */
    private Date date;

    /** Status of the purchase recommendation. */
    private String status;

    /** Handling remarks. */
    private String remark;

    /** Creates a book with every property left {@code null}. */
    public Book() {
    }

    /**
     * Creates a book without an identifier.
     *
     * @param name   title
     * @param author responsible party
     * @param press  publication information
     * @param date   recommendation date
     * @param status recommendation status
     * @param remark handling remarks
     */
    public Book(String name, String author, String press, Date date, String status, String remark) {
        this(null, name, author, press, date, status, remark);
    }

    /**
     * Creates a fully populated book.
     *
     * @param id     record identifier
     * @param name   title
     * @param author responsible party
     * @param press  publication information
     * @param date   recommendation date
     * @param status recommendation status
     * @param remark handling remarks
     */
    public Book(Integer id, String name, String author, String press, Date date, String status, String remark) {
        this.id = id;
        this.name = name;
        this.author = author;
        this.press = press;
        this.date = date;
        this.status = status;
        this.remark = remark;
    }

    /** @return this very instance */
    public Book getBook() {
        return this;
    }

    /** @return the record identifier, possibly {@code null} */
    public Integer getId() {
        return id;
    }

    /** @param id the record identifier to store */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the title */
    public String getName() {
        return name;
    }

    /** @param name the title to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the responsible party */
    public String getAuthor() {
        return author;
    }

    /** @param author the responsible party to store */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the publication information */
    public String getPress() {
        return press;
    }

    /** @param press the publication information to store */
    public void setPress(String press) {
        this.press = press;
    }

    /** @return the recommendation date (the stored reference, not a copy) */
    public Date getDate() {
        return date;
    }

    /** @param date the recommendation date to store (kept by reference) */
    public void setDate(Date date) {
        this.date = date;
    }

    /** @return the recommendation status */
    public String getStatus() {
        return status;
    }

    /** @param status the recommendation status to store */
    public void setStatus(String status) {
        this.status = status;
    }

    /** @return the handling remarks */
    public String getRemark() {
        return remark;
    }

    /** @param remark the handling remarks to store */
    public void setRemark(String remark) {
        this.remark = remark;
    }

    /**
     * Renders every property in the form
     * {@code Book{id=..., name='...', ..., remark='...'}}; string properties are
     * wrapped in single quotes and {@code null} values print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Book{");
        text.append("id=").append(id);
        text.append(", name='").append(name).append('\'');
        text.append(", author='").append(author).append('\'');
        text.append(", press='").append(press).append('\'');
        text.append(", date=").append(date);
        text.append(", status='").append(status).append('\'');
        text.append(", remark='").append(remark).append('\'');
        text.append('}');
        return text.toString();
    }
}
// Downloads every result page of the listing, turns each table row into a Book
// and inserts it through the session's "insert" statement. Everything is
// committed once, at the very end, so a failure leaves nothing committed.
// NOTE(review): SqlSession looks like MyBatis' org.apache.ibatis.session.SqlSession
// (insert/commit/close) — confirm, and confirm DBSession hands out a session
// that this code owns and may close.
final int pageCount = 86; // the site does not expose the total; update by hand when it grows
final int cellsPerRow = 7; // cells 1..6 are read below, so at least 7 must be present
SqlSession sqlSession = DBSession.getSession();
try {
    // Phase 1: fetch all pages up front, so a network error aborts the run
    // before a single row has been inserted.
    List<Document> documents = new ArrayList<>(pageCount);
    for (int page = 1; page <= pageCount; page++) {
        System.out.print("正在解析第 " + page + " 页 : ");
        documents.add(Jsoup.connect("http://202.194.143.19/asord/asord_hist.php")
                .data("page", String.valueOf(page))
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36")
                .timeout(3000)
                .get());
        System.out.println("解析成功");
    }

    // Phase 2: extract the rows of the "table_line" table on every page.
    for (int i = 0; i < documents.size(); i++) {
        Elements table = documents.get(i).getElementsByClass("table_line");
        Elements tr = table.select("tr");
        // Row 0 is skipped, as in the original loop (presumably the header row).
        for (int j = 1; j < tr.size(); j++) {
            Elements tds = tr.get(j).getElementsByClass("whitetext");
            if (tds.size() < cellsPerRow) {
                // A row without the expected cells would otherwise throw
                // IndexOutOfBoundsException and abort the whole import.
                System.err.println("Skipping row " + j + " on page " + (i + 1)
                        + ": expected at least " + cellsPerRow + " cells, found " + tds.size());
                continue;
            }
            // Cell 0 is not used; cells 1..6 map onto the Book constructor in order.
            Book book = new Book(
                    tds.get(1).text(),
                    tds.get(2).text(),
                    tds.get(3).text(),
                    Date.valueOf(tds.get(4).text()),
                    tds.get(5).text(),
                    tds.get(6).text());
            sqlSession.insert("insert", book);
        }
    }

    // Only reached when every page was fetched and every row was handed to the session.
    sqlSession.commit();
} catch (IOException e) {
    // A page could not be downloaded; nothing has been inserted at this point.
    e.printStackTrace();
} finally {
    // Always release the session, including when parsing throws an unchecked exception.
    sqlSession.close();
}
}
可以参考鸿洋大神的博客,设计一个可复用的解析模块:https://blog.csdn.net/lmj623565791/article/details/23272657