JobAboutMATLAB

% Scrape campus-recruiting postings for one keyword from xiaoyuan.zhaopin.com
% and write them to the "智联招聘" sheet of JobAbout<key_word>.xlsx.
key_word = "MATLAB";               % search keyword; also used by the later sections
opt = weboptions("Timeout", inf);  % no request timeout; also used by the later sections
tic;
pageCount = 17; % number of result pages to fetch: matlab 8;python 17
pageRecords = cell(pageCount, 1);
for k = 1:pageCount
    str = webread("https://xiaoyuan.zhaopin.com/search/jn=2&kw=" + key_word + "&pg=" + k, opt);
    % Keep each posting as the text from '{"JobPositionNumber"' through the
    % next '"}' (boundaries included); the fields are cut out of it below.
    pageRecords{k} = extractBetween(str, '{"JobPositionNumber"', '"}', "Boundaries", "inclusive");
end
% Concatenate once instead of regrowing the list on every page.
t = vertcat(pageRecords{:});

% One value per posting for every field. The variable names below become
% the column headers of the table and of the spreadsheet.
RecruitCount = extractBetween(t, 'RecruitCount":"', '"');
CityName = extractBetween(t, '"CityName":"', '"');
CompanyName = extractBetween(t, '"CompanyName":"', '"');
IndustryName = extractBetween(t, '"IndustryName":"', '"');
JobTitle = extractBetween(t, '"JobTitle":"', '"');

% Detail-page URL: job position number followed by the trace-url query string.
positionNumber = string(extractBetween(t, 'JobPositionNumber":"', '"'));
traceUrl = string(extractBetween(t, 'Traceurl":"', '"'));
link = "https://xiaoyuan.zhaopin.com/job/" + positionNumber + "?" + traceUrl;

tbl_zhilainzhaopin = table(CityName, RecruitCount, CompanyName, IndustryName, JobTitle, link);
writetable(tbl_zhilainzhaopin, "JobAbout" + key_word + ".xlsx", 'Sheet', "智联招聘")
toc;
% Scrape the 51job search-result list pages for key_word and write city,
% company and job title to the "前程无忧" sheet of JobAbout<key_word>.xlsx.
% Uses key_word and opt defined earlier in the script.
tic;
pageCount = 721; % number of result pages to fetch: matlab 157;python 721
link = []; % only filled by the disabled detail-page crawl kept below

% NOTE(review): in the original both delimiters of the city extraction were
% empty strings; the markers appear to have been lost when the code was
% published. They are reconstructed here as the "t3" span of the list
% layout, whose first occurrence would be the column header (hence the
% city(2:end) below) -- confirm against the live page markup.
cityStart = '<span class="t3">';
cityEnd = '</span>';

cityPages = cell(pageCount, 1);
companyPages = cell(pageCount, 1);
titlePages = cell(pageCount, 1);
for k = 1:pageCount
    str = webread("https://search.51job.com/list/000000,000000,0000,00,9,99," + key_word + ",2,"+ k + ".html", opt);
    city = extractBetween(str, cityStart, cityEnd);
    cityPages{k} = city(2:end); % drop the first match of every page
    % The same anchor pattern matches twice per posting. The odd matches are
    % used as job titles; the even matches are presumably the company
    % anchors. Taking every match for CompanyName (as before) produced twice
    % as many rows as JobTitle and made table() fail.
    job = extractBetween(str, '<a target="_blank" title="', '"');
    titlePages{k} = job(1:2:end);
    companyPages{k} = job(2:2:end);
%     s = extractBetween(str, 'https://jobs.51job.com/', '.html', "Boundaries", "inclusive");
%     link = [link;s(~contains(s, 'all'))];
end
% Concatenate once instead of regrowing three lists on every page.
CityName = vertcat(cityPages{:});
CompanyName = vertcat(companyPages{:});
JobTitle = vertcat(titlePages{:});

% Disabled alternative: visit every posting's detail page to also collect
% industry and head count. Kept for reference.
% str = [];
% opt = weboptions("Timeout", inf);
% missingLink = [];
% for k = 1:length(link)
%     try
%         str = [str;string(webread(string(link(k)), opt))];
%     catch ME
%         disp(k + " error")
%         missingLink = [missingLink;k];
%     end
% end
%
% CityName = extractBetween(str, '"msg ltype" title="', '&nbsp;&nbsp;');
% CompanyName = extractBetween(str, '<p title="', '"');
% IndustryName = extractBetween(str, '');
% IndustryName = IndustryName(:, 3);
% JobTitle = extractBetween(str, '<h1 title="', '"');
% str1 = str;
% str(~contains(str, '|&nbsp;&nbsp;招')) = '|&nbsp;&nbsp;招若干人';
% RecruitCount = extractBetween(str, '|&nbsp;&nbsp;招', '人');
% tbl_51job = table(CityName, RecruitCount, CompanyName, IndustryName, JobTitle, link(setdiff(1:length(link), missingLink)));
tbl_51job = table(CityName, CompanyName, JobTitle);
writetable(tbl_51job, "JobAbout" + key_word + ".xlsx", 'Sheet', "前程无忧")
toc;
% Download 100 liepin.com search-result pages for key_word, extract
% company, industry, job title and city from each page, and write them to
% the "猎聘" sheet of JobAbout<key_word>.xlsx.
% Uses key_word and opt defined earlier in the script.
tic;
str = [];
% First pass: fetch every page and keep its HTML as one string per page.
% The curPage query parameter runs from 0 to 99 (k-1); all other query
% parameters are fixed.
for k = 1:100
    str = [str;string(webread("https://www.liepin.com/zhaopin/?init=-1&headckid=1e854e2a7c916901&fromSearchBtn=2&ckid=1e854e2a7c916901&degradeFlag=0&key=" + key_word + "&siTag=Ca-7X5hw55sA8HD9SJUASg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=a4a005bec9744031506223cdc461d55e&d_curPage=5&d_pageSize=40&d_headId=a4a005bec9744031506223cdc461d55e&curPage=" + (k-1),opt))];
end
CompanyName = [];
IndustryName = [];
JobTitle = [];
CityName = [];
% Second pass: parse the stored pages one at a time and append to the columns.
for k = 1:100
    CompanyName = [CompanyName;extractBetween(str(k), '<a title="公司', '"     target="_blank"')];
    % s: one fragment per 'field-financing' occurrence, up to the next '/span'.
    s = extractBetween(str(k), 'field-financing', '/span');
    % Fragments that contain 'div' are replaced by the placeholder "无<".
    s(contains(s, 'div')) = "无<";
    % NOTE(review): the start delimiter here is an empty string; it looks
    % like the original marker was lost when the code was published --
    % confirm the intended delimiter. This also overwrites the variable t
    % used by the first section of the script.
    t = strip(extractBetween(s, '', '<'));
    % Fragments without 'indus' take the non-empty entries of t; this
    % assignment requires the two selections to have the same number of elements.
    s(~contains(s, 'indus')) = t(strlength(t) > 0);
    % Fragments with 'indus' are cut before 'a>' and then reduced to the
    % text between '"_blank">' and '</'.
    s(contains(s, 'indus')) = extractBefore(s(contains(s, 'indus')), 'a>');
    s(contains(s, 'indus')) = extractBetween(s(contains(s, 'indus')), '"_blank">', '</');
    IndustryName = [IndustryName;s];
    JobTitle = [JobTitle;extractBetween(str(k), '<h3 title="招聘', '"')];
    % NOTE(review): the end delimiter here is an empty string; it looks
    % like the original marker was lost -- confirm the intended delimiter.
    CityName = [CityName;extractBetween(str(k), 'class="area">', '')];
    % Entries that still contain "span" are truncated before "</span".
    % The mask is evaluated over the whole accumulated CityName on every iteration.
    CityName(contains(CityName, "span")) = extractBefore(CityName(contains(CityName, "span")), "</span");
end
% table() requires all four columns to have the same number of rows.
tbl_liepin = table(CityName, CompanyName, IndustryName, JobTitle);
writetable(tbl_liepin, "JobAbout" + key_word + ".xlsx", 'Sheet', "猎聘")
toc

下面是为 caicaibi 爬取展览信息的代码。

% Crawl the product detail pages product_detail_1 .. product_detail_8000 on
% www.cioe.cn and write product name, page link, category, booth number,
% company name and company link to information.xlsx.
%
% Every kept page contributes exactly one row to every column, so the
% columns stay aligned for table(). Pages that cannot be downloaded are
% recorded in idMissing; pages that download but lack an expected field are
% recorded in idUnparsed.
opts = weboptions("CharacterEncoding", "UTF-8", "Timeout", inf);
productName = [];
companyName = [];
zhanweihao = [];       % booth number (text after "展位号:")
companyLink = [];
productCategoty = [];  % spelling kept: the variable name becomes the column header
link = [];
idMissing = [];        % ids whose page could not be downloaded
idUnparsed = [];       % ids whose page was downloaded but could not be parsed
for k = 1:8000 % 8000
    productLink = "http://www.cioe.cn/zsml/product_detail_" + k +".html";
    try
        str = webread(productLink, opts);
    catch
        % Record the failing id itself (the original appended the whole
        % accumulated link list here).
        idMissing = [idMissing;k];
        disp("missing " + k)
        continue
    end
    if contains(str, "Sorry")
        % Pages containing "Sorry" are skipped without being recorded.
        continue;
    end
    if ~mod(k, 100)
        disp(k) % progress message every 100 ids
    end

    % Parse everything first; append only when every field was found.
    category = extractBetween(str, "行业类别:", "</p>");
    booth = extractBetween(str, "展位号:", "</p>");
    pageTitle = extractBetween(str, "<title>", "</title>");
    anchors = extractBetween(str, 'html">', '</a>');
    companyIds = extractBetween(str, "bull_detail_", ".html");
    if isempty(category) || isempty(booth) || isempty(pageTitle) ...
            || numel(anchors) < 2 || isempty(companyIds)
        idUnparsed = [idUnparsed;k];
        disp("unparsed " + k)
        continue
    end

    % When a pattern matches more than once, only the first match is used,
    % so that each page adds a single row per column.
    companyIds = unique(companyIds);
    link = [link;productLink];
    productCategoty = [productCategoty;category(1)];
    zhanweihao = [zhanweihao;booth(1)];
    productName = [productName;extractBefore(pageTitle(1), '-')];
    companyName = [companyName;anchors(end-1)]; % second-to-last anchor text on the page
    companyLink = [companyLink;"http://www.cioe.cn/zsml/bull_detail_" + companyIds(1) + ".html"];
end
tt = table(productName,link, productCategoty, zhanweihao, companyName,companyLink);
writetable(tt, "information.xlsx")
全部评论

相关推荐

牛客722552937号:新锐之星有点坑爹,特别是对男的
点赞 评论 收藏
分享
点赞 评论 收藏
分享
点赞 收藏 评论
分享
牛客网
牛客企业服务