JobAboutMATLAB
% Scrape job postings matching key_word from three recruitment sites
% (xiaoyuan.zhaopin.com, search.51job.com, www.liepin.com) and write one
% sheet per site into the workbook "JobAbout" + key_word + ".xlsx".
key_word = "MATLAB";
opt = weboptions("Timeout", inf);   % never time out a request

%% xiaoyuan.zhaopin.com  ->  sheet "智联招聘"
tic;
t = [];
for k = 1:17 % result pages to fetch; author's note: matlab 8; python 17
    str = webread("https://xiaoyuan.zhaopin.com/search/jn=2&kw=" + key_word + "&pg=" + k, opt);
    % Keep every record from '{"JobPositionNumber"' through '"}', delimiters included.
    t = [t; extractBetween(str, '{"JobPositionNumber"', '"}', "Boundaries", "inclusive")];
end

% Pull the individual fields out of each captured record.
RecruitCount = extractBetween(t, 'RecruitCount":"', '"');
CityName = extractBetween(t, '"CityName":"', '"');
CompanyName = extractBetween(t, '"CompanyName":"', '"');
IndustryName = extractBetween(t, '"IndustryName":"', '"');
JobTitle = extractBetween(t, '"JobTitle":"', '"');
% Posting URL = job-page prefix + position number + "?" + trace URL value.
link = "https://xiaoyuan.zhaopin.com/job/" + string(extractBetween(t, 'JobPositionNumber":"', '"')) + "?" + string(extractBetween(t, 'Traceurl":"', '"'));

tbl_zhilainzhaopin = table(CityName, RecruitCount, CompanyName, IndustryName, JobTitle, link);
writetable(tbl_zhilainzhaopin, "JobAbout" + key_word + ".xlsx", 'Sheet', "智联招聘")
toc;

%% search.51job.com  ->  sheet "前程无忧"
tic;
link = [];
CompanyName = [];
CityName = [];
JobTitle = [];
for k = 1:721 % result pages to fetch; author's note: matlab 157; python 721
    str = webread("https://search.51job.com/list/000000,000000,0000,00,9,99," + key_word + ",2,"+ k + ".html", opt);

    % NOTE(review): both boundary strings are empty in the source as received;
    % the original HTML delimiters appear to have been lost and must be
    % restored before this line can extract anything meaningful.
    city = extractBetween(str, '', '');
    CityName = [CityName; city(2:end)];   % first match is dropped

    % One pattern captures the title attribute of every matching anchor.
    % JobTitle takes the odd-numbered matches; the even-numbered ones are
    % used as company names so that both columns end up the same height.
    % NOTE(review): odd = job / even = company is inferred from the indexing
    % below - confirm against the page markup.
    job = extractBetween(str, '<a target="_blank" title="', '"');
    JobTitle = [JobTitle; job(1:2:end)];
    CompanyName = [CompanyName; job(2:2:end)];

    % s = extractBetween(str, 'https://jobs.51job.com/', '.html', "Boundaries", "inclusive");
    % link = [link;s(~contains(s, 'all'))];
end

% Disabled: fetch every posting's detail page and extract richer fields.
% str = [];
% opt = weboptions("Timeout", inf);
% missingLink = [];
% for k = 1:length(link)
%     try
%         str = [str;string(webread(string(link(k)), opt))];
%     catch ME
%         disp(k + " error")
%         missingLink = [missingLink;k];
%     end
% end
%
% CityName = extractBetween(str, '"msg ltype" title="', ' ');
% CompanyName = extractBetween(str, '<p title="', '"');
% IndustryName = extractBetween(str, '');
% IndustryName = IndustryName(:, 3);
% JobTitle = extractBetween(str, '<h1 title="', '"');
% str1 = str;
% str(~contains(str, '| 招')) = '| 招若干人';
% RecruitCount = extractBetween(str, '| 招', '人');
% tbl_51job = table(CityName, RecruitCount, CompanyName, IndustryName, JobTitle, link(setdiff(1:length(link), missingLink)));

tbl_51job = table(CityName, CompanyName, JobTitle);
writetable(tbl_51job, "JobAbout" + key_word + ".xlsx", 'Sheet', "前程无忧")
toc;

%% www.liepin.com  ->  sheet "猎聘"
tic;
% Download all 100 result pages first; curPage is zero-based, hence (k-1).
% The query string uses "&degradeFlag=0"; the source as received had the
% "&deg" prefix corrupted into a degree sign, which merged two parameters.
str = [];
for k = 1:100
    str = [str; string(webread("https://www.liepin.com/zhaopin/?init=-1&headckid=1e854e2a7c916901&fromSearchBtn=2&ckid=1e854e2a7c916901&degradeFlag=0&key=" + key_word + "&siTag=Ca-7X5hw55sA8HD9SJUASg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=a4a005bec9744031506223cdc461d55e&d_curPage=5&d_pageSize=40&d_headId=a4a005bec9744031506223cdc461d55e&curPage=" + (k-1), opt))];
end

CompanyName = [];
IndustryName = [];
JobTitle = [];
CityName = [];
for k = 1:100
    CompanyName = [CompanyName; extractBetween(str(k), '<a title="公司', '" target="_blank"')];

    % Industry: take the fragment between 'field-financing' and '/span'.
    s = extractBetween(str(k), 'field-financing', '/span');
    s(contains(s, 'div')) = "无<";   % fragments containing 'div' get the placeholder "无<"
    % NOTE(review): the start boundary is empty in the source as received;
    % the original delimiter appears to have been lost - restore it.
    t = strip(extractBetween(s, '', '<'));
    % Fragments without 'indus' take the non-empty stripped text ...
    s(~contains(s, 'indus')) = t(strlength(t) > 0);
    % ... fragments with 'indus' are cut before 'a>' and then reduced to
    % the text between '"_blank">' and '</'.
    s(contains(s, 'indus')) = extractBefore(s(contains(s, 'indus')), 'a>');
    s(contains(s, 'indus')) = extractBetween(s(contains(s, 'indus')), '"_blank">', '</');
    IndustryName = [IndustryName; s];

    JobTitle = [JobTitle; extractBetween(str(k), '<h3 title="招聘', '"')];

    % NOTE(review): the end boundary is empty in the source as received;
    % the original closing delimiter appears to have been lost - restore it.
    CityName = [CityName; extractBetween(str(k), 'class="area">', '')];
    % Entries that still contain "span" are truncated before "</span".
    CityName(contains(CityName, "span")) = extractBefore(CityName(contains(CityName, "span")), "</span");
end

tbl_liepin = table(CityName, CompanyName, IndustryName, JobTitle);
writetable(tbl_liepin, "JobAbout" + key_word + ".xlsx", 'Sheet', "猎聘")
toc
下面是为 caicaibi 爬取展览信息的代码。
% Walk the product detail pages product_detail_1.html .. product_detail_8000.html
% on www.cioe.cn, extract product and exhibitor fields from each page, and
% write the collected rows to "information.xlsx".
opts = weboptions("CharacterEncoding", "UTF-8", "Timeout", inf);

productName = [];
companyName = [];
zhanweihao = [];        % text following "展位号:" on each page
companyLink = [];
productCategoty = [];   % spelling kept: it becomes the column header in the output file
link = [];
idMissing = [];         % ids (k) whose request failed

for k = 1:8000 % 8000
    productLink = "http://www.cioe.cn/zsml/product_detail_" + k +".html";

    try
        str = webread(productLink, opts);
    catch
        % Record the id that failed. The earlier code appended the whole
        % accumulated `link` array here instead of the failing entry.
        idMissing = [idMissing; k];
        disp("missing " + k)
        continue
    end

    % Pages containing "Sorry" are skipped.
    if contains(str, "Sorry")
        continue;
    end

    % Progress output every 100 ids (only reached for pages that were kept).
    if ~mod(k, 100)
        disp(k)
    end

    link = [link; productLink];
    productCategoty = [productCategoty; extractBetween(str, "行业类别:", "</p>")];
    zhanweihao = [zhanweihao; extractBetween(str, "展位号:", "</p>")];
    % Product name: the <title> text before its first '-'.
    productName = [productName; extractBefore(extractBetween(str, "<title>", "</title>"), '-')];

    % Company name: the second-to-last anchor text following 'html">'.
    tmp = extractBetween(str, 'html">', '</a>');
    companyName = [companyName; tmp(end-1)];

    % Company page URL rebuilt from the distinct bull_detail id(s) on the page.
    % NOTE(review): if a page references more than one distinct id this adds
    % several rows and the table columns will no longer match in height.
    companyLink = [companyLink; "http://www.cioe.cn/zsml/bull_detail_" + unique(extractBetween(str, "bull_detail_", ".html")) + ".html"];
end

tt = table(productName,link, productCategoty, zhanweihao, companyName,companyLink);
writetable(tt, "information.xlsx")