爬虫第13节

爬取中国大学排名实例

import requests
from bs4 import BeautifulSoup
import bs4

import re


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Sends a GET request with a browser-like ``user-agent`` header and a
    30-second timeout, then decodes the body with the encoding that
    requests detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or ``""`` when the request fails
        (connection problem, timeout, invalid URL, or an HTTP error
        status reported by ``raise_for_status``).
    """
    kv = {"user-agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are turned into an empty result;
        # KeyboardInterrupt and programming errors still propagate.
        return ""
    r.encoding = r.apparent_encoding
    return r.text


def _cellText(cell):
    """Return the text inside *cell* with every whitespace character removed.

    ``get_text()`` concatenates the text of all descendants and leaves out
    HTML comments, so markers such as ``<!---->`` inside a cell do not end
    up in the result.
    """
    return "".join(cell.get_text().split())


def fillUnivList(ulist,html):
    """Append one ``[rank, name, score]`` row to *ulist* per table row in *html*.

    The first ``<tbody>`` of the document is scanned. For each ``<tr>`` the
    1st, 2nd and 5th ``<td>`` cells are read; the 2nd cell's text is taken
    from its first ``<a>`` element when there is one, otherwise from the
    cell itself. All three values are stored as strings without whitespace.

    Args:
        ulist: List that is extended in place.
        html: HTML source of the ranking page. An empty string or a page
            without ``<tbody>`` leaves *ulist* unchanged.

    Returns:
        None.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # Nothing to parse, e.g. the download failed and html is "".
        return

    for tr in tbody.children:
        # Direct children also include whitespace strings between rows.
        if not isinstance(tr, bs4.element.Tag):
            continue

        tds = tr.find_all('td')
        if len(tds) < 5:
            # Rows without the expected cells cannot supply a score column.
            continue

        anchor = tds[1].find('a')
        nameSource = anchor if anchor is not None else tds[1]

        ulist.append([
            _cellText(tds[0]),
            _cellText(nameSource),
            _cellText(tds[4]),
        ])

def printUnivList(ulist,num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Each row is a sequence whose first three items are printed centred in
    10-character columns separated by tabs.

    Args:
        ulist: Sequence of rows, each with at least three items.
        num: Maximum number of rows to print. When *ulist* holds fewer
            rows, only the available ones are printed; a value of zero or
            less prints the header only.
    """
    # Field {3} supplies the fill character for the name column. chr(12288)
    # is the full-width (ideographic) space, so the padding is made of
    # characters as wide as the Chinese text being centred.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("排名","学校名称","总分",fill))
    # Slicing instead of indexing avoids IndexError on short or empty lists.
    for u in ulist[:max(num, 0)]:
        print(tplt.format(u[0],u[1],u[2],fill))

def main(url="https://www.shanghairanking.cn/rankings/bcur/2020", num=20):
    """Download the ranking page, extract its rows and print the first *num*.

    Args:
        url: Page to fetch; defaults to the 2020 ranking page.
        num: Number of rows passed to ``printUnivList``; defaults to 20.
    """
    uinfo = []
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, num)


# Executed at module level, so this runs on import as well as when the
# file is started as a script.
main()

其中与慕课上的代码稍有变动:由于页面在 td 标签中加入了 HTML 注释(如 `<!---->`),因此不能直接用 .string 获得标签中的信息。所以先将标签转化为字符串类型,然后运用 re.sub() 对字符串进行处理,最后得到所需要的结果。

全部评论

相关推荐

菜鸡29号:根据已有信息能初步得出以下几点: 1、硕士排了大本和大专 2、要求会多语言要么是招人很挑剔要么就是干的活杂 3、给出校招薪资范围过于巨大,说明里面的薪资制度(包括涨薪)可能有大坑
点赞 评论 收藏
分享
kl_我是东山啊:《相关公司:阿里巴巴》
投递阿里巴巴等公司10个岗位
点赞 评论 收藏
分享
评论
点赞
收藏
分享

创作者周榜

更多
牛客网
牛客企业服务