How to Buy Snacks for Your Girlfriend: Development Notes
Entry 1:
Purpose of the Program
This program was written for Nowcoder's Beauty of Programming (Issue 02) challenge, "How to buy your girlfriend the snacks she loves". The prompt:
Use your code to crawl snacks with high purchase rates from Tmall, JD, or any other e-commerce site, and save your Double 11! They can be snacks whose Taobao keywords include "girlfriend", snacks that simply sell well, or anything you dig up by whatever means! In short, what really counts is hands-on practice!
Result Preview
Program Structure
1. main.py is the entry point that calls the other modules to run the crawl;
2. SinglePageSpider.py is a simple single-page crawler that wraps Python 3's urllib;
3. DataModel.py is the data model: it fetches the page, matches the information, and generates the results.
Note that this program runs under Python 3; Python 2's urllib is organized differently from Python 3's, which is worth keeping in mind~ (a small fetch sketch follows below).
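The original SinglePageSpider.py isn't reproduced here, but as a rough idea, a minimal sketch of such a urllib wrapper might look like this (the function name and headers are illustrative assumptions, not the author's code):

# Hypothetical sketch of a single-page fetch helper in the spirit of
# SinglePageSpider.py; Python 3's urllib.request replaces Python 2's urllib2.
from urllib.request import Request, urlopen

def fetch_page(url, encoding="utf-8"):
    # a browser-like User-Agent avoids the most trivial bot blocking
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urlopen(req, timeout=10) as resp:
        return resp.read().decode(encoding, errors="replace")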
Solving the Problem
This program does a simple single-page crawl of Taobao snacks (one page of snacks is already plenty to eat; if it isn't, adding one more URL does the trick, since everything is modular). What I mainly want to share is how I solved the problems along the way!
Can't scrape the Taobao page?
At first I crawled the page the ordinary way, as HTML, but the corresponding HTML regular expressions matched nothing??? After double-checking several times that the regexes were correct, I inspected the downloaded page and found it wasn't the final HTML at all: the product information had been stuffed into JavaScript. A quick Google showed that essentially everyone says you must either parse the JS or use Taobao's APIs (many of which apparently cost money). Ouch. Is it really hopeless?
Do we really have to parse the JavaScript?
I didn't want anything that heavyweight, such as pyspider or other frameworks that bundle a JS engine, or browser plugins, and paying for an API would be even less fun~
So I went back to the page content my program had fetched and took a closer look, and it turns out this form is actually easier to scrape!!! The data is simply embedded as plain JSON, as shown below.
{"i2iTags":{"samestyle":{"url":"/search?type\u003dsamestyle\u0026app\u003di2i\u0026rec_type\u003d1\u0026uniqpid\u003d-165811852\u0026nid\u003d520113726631"},"similar":{"url":"/search?type\u003dsimilar\u0026app\u003di2i\u0026rec_type\u003d1\u0026uniqpid\u003d-165811852\u0026nid\u003d520113726631"}},"p4pTags":[],"nid":"520113726631","category":"50009866","pid":"-165811852","title":"【三只松鼠_猪肉猪肉脯210g】休闲食品小吃\u003cspan class\u003dH\u003e零食\u003c/span\u003e靖江特产猪肉干","raw_title":"【三只松鼠_猪肉猪肉脯210g】休闲食品小吃零食靖江特产猪肉干","pic_url":"//g-search2.alicdn.com/img/bao/uploaded/i4/i2/TB1bTo6KpXXXXb5XVXXXXXXXXXX_!!0-item_pic.jpg","detail_url":"//detail.tmall.com/item.htm?id\u003d520113726631\u0026ns\u003d1\u0026abbucket\u003d0","view_price":"19.90","view_fee":"0.00","item_loc":"安徽 芜湖","reserve_price":"40.00","view_sales":"137017人付款","comment_count":"502145","user_id":"880734502","nick":"三只松鼠旗舰店","shopcard":{"levelClasses":[{"levelClass":"icon-supple-level-jinguan"},{"levelClass":"icon-supple-level-jinguan"},{"levelClass":"icon-supple-level-jinguan"},{"levelClass":"icon-supple-level-jinguan"},{"levelClass":"icon-supple-level-jinguan"}],"isTmall":true,"delivery":[484,1,1637],"description":[491,1,3066],"service":[486,1,1630],"encryptedUserId":"UOmgWMGv0MFNy","sellerCredit":20,"totalRate":10000},"icon":[{"title":"双十一商品","dom_class":"icon-fest-shuangshiyi","position":"1","show_type":"0","icon_category":"baobei","outer_text":"0","html":"","icon_key":"icon-fest-shuangshiyi","trace":"srpservice","traceIdx":6,"innerText":"双十一商品"},{"title":"双11购物券","dom_class":"icon-fest-shuangshiyigouwuquan","position":"1","show_type":"0","icon_category":"baobei","outer_text":"0","html":"","icon_key":"icon-fest-shuangshiyigouwuquan","trace":"srpservice","traceIdx":7,"innerText":"双11购物券"},{"title":"度量单位","dom_class":"icon-service-duliangheng","position":"1","show_type":"0","icon_category":"cat_special","outer_text":"0","html":"\u003cspan class\u003d\"icon-pit icon-service-duliang\"\u003e\u003cb\u003e47.38\u003c/b\u003e元/500g\u003c/span\u003e","icon_key":"icon-service-duliangheng","trace":"srpservice","traceIdx":8,"innerText":"度量单位"},{"title":"尚天猫,就购了","dom_class":"icon-service-tianmao","position":"1","show_type":"0","icon_category":"baobei","outer_text":"0","html":"","icon_key":"icon-service-tianmao","trace":"srpservice","traceIdx":9,"innerText":"天猫宝贝","url":"//www.tmall.com/"}],"comment_url":"//detail.tmall.com/item.htm?id\u003d520113726631\u0026ns\u003d1\u0026abbucket\u003d0\u0026on_comment\u003d1","shopLink":"//store.taobao.com/shop/view_shop.htm?user_number_id\u003d880734502","risk":""}
So all I need is a simple regular expression; I don't even have to parse the JSON~
pattern = '"raw_title":"([^"]*)","pic_url":"([^"]*)","detail_url":"[^"]*","view_price":"([^"]*)","view_fee":"[^"]*","item_loc":"[^"]*","reserve_price":"[^"]*","view_sales":"(\d+)人付款","comment_count":"(\d+)"'
How long the above solution will last
Note that this trick only works because Taobao happens to ship its data this way for now; if it later delivers the data some other way, the approach will have to change with it.
jacket,
9:54 AM, November 4, 2016, at Zhishan Garden
Entry 2:
var keyword = "d3.js"; //@input(keyword, 查询关键字, 爬取该关键字搜索出来的京东商品) var comment_count = 100; //@input(comment_count, 爬取的评论数, 最多爬取多少条评论) var page_count = comment_count / 10; keyword = keyword.trim(); var scanUrls = []; scanUrls.push("http://search.jd.com/Search?keyword=" + keyword.replace(/ /g, "+") + "&enc=utf-8&scrolling=y&page=200"); var helperUrlRegexes = []; helperUrlRegexes.push("http://search\\.jd\\.com/Search\\?keyword=" + keyword.replace(/ /g, "\\+").replace(/\./g, "\\.") + "&enc=utf-8&scrolling=y&page=\\d+"); var configs = { domains: ["search.jd.com", "item.jd.com", "club.jd.com"], scanUrls: scanUrls, contentUrlRegexes: ["http://item\\.jd\\.com/\\d+.html"], helperUrlRegexes: helperUrlRegexes, interval: 10000, fields: [ { // 第一个抽取项 name: "title", selector: "//div[@id='name']/h1", required: true }, { // 第一个抽取项 name: "productid", selector: "//div[contains(@class,'fl')]/span[2]", required: true }, { name: "comments", selector: "//div[@id='comment-pages']/span", repeated: true, children: [ { name: "page", selector: "//text()" }, { name: "comments", sourceType: SourceType.AttachedUrl, attachedUrl: "http://club.jd.com/productpage/p-{$.productid}-s-0-t-3-p-{page}.html", selectorType: SelectorType.JsonPath, selector: "$.comments", repeated: true, children:[ { name: "com_content", selectorType: SelectorType.JsonPath, selector: "$.content" }, { name: "com_nickname", selectorType: SelectorType.JsonPath, selector: "$.nickname" } ] } ] } ] }; configs.afterDownloadPage = function(page, site) { var matches = /item\.jd\.com\/(\d+)\.html/.exec(page.url); if (!matches) return page; var commentUrl = "http://club.jd.com/productpage/p-"+matches[1]+"-s-0-t-3-p-0.html"; var result = site.requestUrl(commentUrl); var data = JSON.parse(result); var commentCount = data.productCommentSummary.commentCount; var pages = commentCount / 10; if (pages > page_count) pages = page_count; var pageHtml = "<div id=\"comment-pages\">"; for (var i = 0; i < pages; i++) { pageHtml += "<span>" + i + "</span>"; } pageHtml += "</div>"; var index = page.raw.indexOf("</body>"); page.raw = page.raw.substring(0, index) + pageHtml + page.raw.substring(index); return page; }; var dataSku = 0; configs.onProcessHelperPage = function(page, content, site) { var num = parseInt(extract(content, "//*[@id='J_goodsList']/ul/li[1]/@data-sku")); if (dataSku === 0) { dataSku = isNaN(num) ? 0 : num; } else if (dataSku === num) { dataSku = 0; return false; } var currentPageNum = parseInt(page.url.substring(page.url.indexOf("&page=") + 6)); if (currentPageNum === 0) { currentPageNum = 1; } var pageNum = currentPageNum + 1; var nextUrl = page.url.replace("&page=" + currentPageNum, "&page=" + pageNum); site.addUrl(nextUrl); return true; }; configs.afterExtractPage = function(page, data) { if (data.comments === null || data.comments === undefined) return data; var comments = []; for (var i = 0; i < data.comments.length; i++) { var p = data.comments[i]; for (var j = 0; j < p.comments.length; j++) { comments.push(p.comments[j]); } } data.comments = comments; return data; }; var crawler = new Crawler(configs); crawler.start(); //牛妹双11快乐,大家双11快乐(*^▽^*)
Entry 3:
1. Preface
Nowcoder is running a Beauty of Programming event; this round's problem is http://www.nowcoder.com/discuss/18223?type=0&order=0&pos=2&page=1
It happens to be a crawler problem, so I decided to write it with pyspider.
pyspider documentation:
http://docs.pyspider.org/en/latest/tutorial/
2. Implementation
2.1 Analyzing the Page
Our target site is https://chi.taobao.com
At first glance it looks like we could parse each product's details straight out of that page; in reality we can't, because the product data is loaded asynchronously as JSON.
Looking at the page source alone, there is no useful information to extract at all.
(Screenshot: the page before the JSON data has been loaded.)
With Chrome's powerful DevTools network monitor, we found the request that actually carries the data.
Once we have the data source, processing is straightforward; a small sketch of unwrapping the payload follows.
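Concretely, the response is JSONP: the JSON body arrives wrapped in a callback call such as jsonp221(...). A minimal sketch of unwrapping it (the URL is the same one the implementation in section 2.4 crawls; the rest is illustrative, not the author's code):

import json
import re
from urllib.request import Request, urlopen

url = ("https://tce.taobao.com/api/mget.htm?callback=jsonp221&tce_sid=659631"
       "&tce_vid=8,2&tid=,&tab=,&topic=,&count=,&env=online,online")
req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
text = urlopen(req, timeout=10).read().decode("utf-8", errors="replace")

# strip the surrounding "jsonp221( ... )" to recover the JSON payload
payload = re.search(r'\((.*)\)', text, re.S).group(1)
data = json.loads(payload)
print(list(data["result"].keys()))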
2.2 References and Notes
- On Windows, pyspider runs into all sorts of inexplicable problems; Linux is recommended
- Converting between Python objects and strings: http://blog.sina.com.cn/s/blog_4ddef8f80102v8af.html
- Using MySQLdb: http://blog.csdn.net/zhyh1435589631/article/details/51544903
- pyquery attribute handling (pyspider parses pages with pyquery): https://pythonhosted.org/pyquery/attributes.html
- MySQLdb installation errors: http://stackoverflow.com/questions/5178292/pip-install-mysql-python-fails-with-environmenterror-mysql-config-not-found
- Parsing JSON with pyspider: https://segmentfault.com/a/1190000002477870
2.3 Database Setup
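The body of this section appears to have been lost. Reconstructing from the INSERT statement in section 2.4's code, a plausible schema sketch (an assumption, not the author's actual DDL) would be:

# assumed food_info schema, reverse-engineered from the INSERT below
import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="zhyh2010",
                       db="taobao_food", charset="utf8")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS food_info (
        id    INT AUTO_INCREMENT PRIMARY KEY,
        url   VARCHAR(512),
        price VARCHAR(32),
        title VARCHAR(256)
    ) DEFAULT CHARSET = utf8
""")
conn.commit()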
2.4 Implementation Code
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-11-05 23:18:55
# Project: taobao_food

from pyspider.libs.base_handler import *
import re
import json
import MySQLdb

class Handler(BaseHandler):
    # database connection setup
    def __init__(self):
        db_host = "127.0.0.1"
        user = "root"
        passwd = "zhyh2010"
        db = "taobao_food"
        charset = "utf8"
        conn = MySQLdb.connect(host=db_host, user=user, passwd=passwd, db=db, charset=charset)
        conn.autocommit(True)
        self.db = conn.cursor()

    # the crawler's start URL
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://tce.taobao.com/api/mget.htm?callback=jsonp221&tce_sid=659631&tce_vid=8,2&tid=,&tab=,&topic=,&count=,&env=online,online',
                   callback=self.json_parser)

    # pull the JSON payload out of the JSONP response
    @config(age=24 * 60 * 60)
    def select_json(self, response):
        content = response.text
        pattern = re.compile(r'window.jsonp.*?\((.*?)\)', re.S)
        content_select = re.findall(pattern, content)
        return content_select[0].strip()

    # extract the fields we need and insert them into the database table
    def product_info(self, response):
        for data in response["result"]:
            res = {
                "item_pic": "https:" + data["item_pic"],
                "item_youhui_price": data["item_youhui_price"],
                "item_title": data["item_title"]
            }
            sql = "insert into food_info(url, price, title) values (%s, %s, %s)"
            values = [(res["item_pic"], res["item_youhui_price"], res["item_title"])]
            self.db.executemany(sql, values)

    # parse the JSON and hand each product group to product_info
    @config(age=24 * 60 * 60)
    def json_parser(self, response):
        content = self.select_json(response)
        contents = json.loads(content)
        subres = contents["result"]
        for each in subres:
            self.product_info(subres[each])
3. Results
Entry 4:
#coding=utf-8
# Tested on: ubuntu 14.04, python 2.7.6, chrome
# Requires selenium and chromedriver
from selenium import webdriver
from selenium.webdriver.common.by import By
import sys

GBK = 'gbk'

def logToFile(tag, msg):
    # note: mode 'w' truncates the log file on every call
    logFile = open('log', 'w')
    out = tag + '--\n' + msg
    logFile.write(out)

def log(tag, msg):
    print tag, ' -- '
    print msg

def defLog(msg):
    log('out', msg)

# holds one snack's information
class Item:
    def __init__(self):
        self.CODE = 'utf-8'

# writes the results to a markdown file
class MarkdownWriter:
    def __init__(self, name='out.md'):
        mdFile = open(name, 'w')
        self.mdFile = mdFile

    def writeContent(self, content):
        self.mdFile.write(content)

    def writeItems(self, title, items):
        # assemble the markdown
        content = '### ' + title + ' \n'
        for item in items:
            content += '#### ' + item.title + ' \n'
            content += '![img](' + item.img + ') \n'
            content += '[goto](' + item.url + ') \n'
            content += 'money: ' + item.money + ' \n'
            content += 'store: ' + item.store + ' \n'
            content += '\n\n'
        self.mdFile.write(content)

class TaoBaoSpider:
    def __init__(self):
        driver = webdriver.Chrome()
        self.driver = driver

    def getUrl(self, url):
        print 'start get ...'
        # let Chrome load the url, including its js
        self.driver.get(url)
        print 'get finished ...'

    def getHtmlWithJs(self):
        return self.driver.page_source

    def getElements(self):
        print 'get item ...'
        els = self.driver.find_elements(By.CSS_SELECTOR, "li[class=' item item-border']")
        return els

    def getContent(self, element):
        item = Item()
        # pull the fields we need out of the rendered html and pack them into an Item
        item.img = element.find_element_by_tag_name('img').get_attribute('src')
        item.money = element.find_element(By.CSS_SELECTOR, "div[class='price child-component clearfix']").find_element_by_tag_name('strong').text
        titleElement = element.find_element(By.CSS_SELECTOR, "div[class='title child-component']").find_element_by_class_name('J_AtpLog')
        item.title = titleElement.text
        item.url = titleElement.get_attribute('href')
        item.store = element.find_element(By.CSS_SELECTOR, "div[class='seller child-component clearfix']").find_element_by_tag_name('a').text
        return item

    def start(self, url):
        self.url = url
        self.getUrl(url)
        els = self.getElements()
        items = []
        for e in els:
            item = self.getContent(e)
            items.append(item)
        return items

def main():
    # set the default encoding
    reload(sys)
    sys.setdefaultencoding('utf-8')
    url = 'https://world.taobao.com/search/search.htm?_ksTS=1478358034370_312&spm=a21bp.7806943.20151106.1&search_type=0&_input_charset=utf-8&navigator=all&json=on&q=%E5%A5%B3%E6%9C%8B%E5%8F%8B%20%E9%9B%B6%E9%A3%9F&cna=Eg9NDplivkkCAXuCB323%2Fsy9&callback=__jsonp_cb&abtest=_AB-LR517-LR854-LR895-PR517-PR854-PR895'
    # run the spider
    spider = TaoBaoSpider()
    items = spider.start(url)
    # write the results to a markdown file
    writer = MarkdownWriter('taobao.md')
    writer.writeItems('零食列表', items)

main()
Entry 5:
package years.year2016.months11;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import years.year2016.months10.WebUtil;

public class WebDataGain {

    public static void main(String[] args) {
        WebDataGain w = new WebDataGain();
        String url = "http://www.biqugezw.com/3_3096/";
        String bookname = "一念永恒";
        w.downNovel_Biqugezw(url, bookname);
    }

    /**
     * Download a novel from biqugezw.com
     * @param url
     * @param bookName
     */
    public void downNovel_Biqugezw(String url, String bookName) {
        String url_root = "http://www.biqugezw.com";
        // connect to the site with Jsoup
        Document doc = null;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException e2) {
            e2.printStackTrace();
            return;
        }
        // select the #list container, then the chapter links inside it
        Elements elementList = doc.select("#list");
        String query = "a[href~=/[0-9]{1}_[0-9]{4}/.*html]";
        Elements elements = elementList.select(query);
        int size = elements.size();
        System.out.println(size);
        String fileName = "";
        int num = 0;
        int initnum = 371;
        for (int i = initnum; i < size; i++) {
            Element e = elements.get(i);
            String href = e.attr("href");
            String tempurl = url_root + href;
            System.out.println(tempurl);
            Document docInner = null;
            try {
                docInner = Jsoup.connect(tempurl).get();
            } catch (IOException e1) {
                e1.printStackTrace();
                System.out.println(fileName);
                System.out.println(i);
                continue; // skip this chapter if the fetch failed
            }
            Elements elementsClass = docInner.select(".bookname");
            Elements elementsH = elementsClass.select("h1");
            String sectionkname = elementsH.text();
            System.out.println(sectionkname);
            Elements elementsContent = docInner.select("#content");
            String content = elementsContent.text();
            System.out.println(content);
            // group every 20 chapters into one output file
            num = i % 20;
            if (num == 0 && i == 0) {
                fileName = "1-20章";
            } else if (num == 0 && i != 0) {
                fileName = i + "-" + (i + 20) + "章节";
            } else if (i == initnum) {
                int temp = initnum - num;
                fileName = temp + "-" + (temp + 20) + "章节";
            }
            try {
                WebUtil.downloadText(sectionkname + " " + content,
                        bookName + "--" + fileName + ".txt",
                        WebUtil.getFileDir() + "//book//" + bookName + "//");
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
    }
}
Entry 6:
package com.huowolf.service;

import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.huowolf.pojo.Product;

public class SpiderService {

    /**
     * Crawl one page of results and return the set of Product objects found.
     */
    public Set<Product> spiderOnepage(String keyword, int page) {
        TreeSet<Product> productSet = new TreeSet<Product>();
        /*
         * offset=2 selects the page's "sort by sales" ordering
         */
        try {
            Document doc = Jsoup.connect(
                    "http://search.jd.com/Search?enc=utf-8&qrst=1&rt=1&psort=3&stop=1&vt=2&offset=2&click=0")
                    .data("keyword", keyword).data("page", String.valueOf(page)).userAgent("Mozilla")
                    .cookie("auth", "token").get();
            Elements goodsList = doc.select(".gl-item");
            for (Element goodsEle : goodsList) {
                // wrap the parsed fields in an object
                Product p = new Product();
                // the image URL appears in either the src or the data-lazy-img attribute
                String img = goodsEle.select(".p-img a img").attr("src");
                if (img.equals("")) {
                    img = goodsEle.select(".p-img a img").attr("data-lazy-img");
                }
                p.setImage("http:" + img); // set the image
                String price = goodsEle.select(".p-price strong").attr("data-price");
                System.out.println(price);
                if (!price.equals("0.00")) {
                    p.setPrice(Double.parseDouble(price)); // set the price
                } else {
                    System.out.println(goodsEle.select(".p-price").html());
                }
                goodsEle.select(".p-name em font").unwrap(); // strip <font class="skcolor_ljg"></font> first
                String description = goodsEle.select(".p-name em").first().html(); // there may be several
                p.setDescription(description); // set the description
                String commit = goodsEle.select(".p-commit a").first().html();
                commit = commit.substring(0, commit.length() - 1);
                long commitPro = 0;
                if (commit.contains("万")) {
                    commit = commit.replaceAll("万", "");
                    commitPro = (long) ((Double.parseDouble(commit)) * 10000);
                } else {
                    commitPro = (long) Double.parseDouble(commit);
                }
                p.setComment(commitPro); // set the comment count
                productSet.add(p);
            }
            // keep only this page's ten best sellers
            Iterator<Product> it = productSet.iterator();
            int num = 0;
            while (it.hasNext()) {
                it.next();
                num++;
                if (num > 10) {
                    it.remove();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return productSet;
    }

    public void spiderData(String keyword) {
        TreeSet<Product> productSet = new TreeSet<Product>();
        // crawl the ten pages that represent the highest sales
        for (int i = 1; i <= 10; i++) {
            TreeSet<Product> productTemp = (TreeSet<Product>) spiderOnepage(keyword, i);
            System.out.println("-----------------------" + productTemp.size());
            productSet.addAll(productTemp);
            // sleep 500 ms between pages, otherwise some price data fails to load
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        System.out.println("-----------------------" + productSet.size());
    }
}
Entry 7:
var webpage = require('webpage'),
    page = webpage.create();

page.viewportSize = { width: 1024, height: 800 };
page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };
page.settings = {
    javascriptEnabled: false,
    loadImages: true,
    userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/19.0'
};

// render a screenshot of the JD search results for "女朋友零食" (girlfriend snacks)
page.open('http://search.jd.com/Search?keyword=%E5%A5%B3%E6%9C%8B%E5%8F%8B%E9%9B%B6%E9%A3%9F&enc=utf-8&pvid=u4eabcvi.cns6qn', function (status) {
    if (status === 'fail') {
        console.log('open page fail!');
    } else {
        page.render('./test.png');
        console.log('the mirror page has been saved');
    }
    // release the memory
    page.close();
});