如何给女朋友买零食之开发文档
作品一:
程序的目的
本程序是写来参加牛客网的【编程之美02期】如何给女朋友买到心仪的零食,题目是:
用你的代码,来爬取天猫或者京东或者任一电商网站上购买率比较高的零食,来拯救你双十一!可以是淘宝关键字中有女朋友字样的零食,也可以是直接看购买率比较高的零食,亦或是你通过什么途径来获取!总之!真正锻炼动手能力才是真!
结果预览
程序结构说明
1. main.py是调用其它模块实现爬取的入口;
2. SinglePageSpider.py是一个封装了python3的urllib库的简单单页面爬虫;
3. DataModel.py是数据模型,即负责获取页面、匹配信息、生成结果的。
本程序运行在python3环境下,python2的urllib/urllib2库与python3的urllib库用法不同,这一点值得注意~
解决问题
本程序用来对淘宝的零食进行简单的单页面爬取(单个页面上的零食已经够吃了,再不够加多一个url就搞定,反正都是模块化的)。 想分享的主要是解决的思路!
爬取不到淘宝页面?
一开始爬取页面的时候,我用的是最平常的html爬取,但是发现用相应的html正则表达式,居然匹配不到内容??? 反复确认了几次,正则表达式没写错,然后就看了爬下来的页面内容,发现不是最终的html内容,它把商品信息放到javascript里边去了,google了一下,发现基本所有人都说要么用js解析,要么用淘宝提供的api(好像挺多api需要收费的),坑爹啊,真的做不到了吗???
一定要用js去解析吗?
我不想搞那么复杂的东西,比如用pyspider这些自带了解析js的框架,或者浏览器插件等等,花钱去买api就更没劲了~
于是乎,我重新回到用程序获取到的页面内容,分析了一下,发现其实这种形式的内容抓取起来更加简单!!!它只是把数据变成了纯粹的json数据而已!!!如下所示。
{"i2iTags":{"samestyle":{"url":"/search?type\u003dsamestyle\u0026app\u003di2i\u0026rec_type\u003d1\u0026uniqpid\u003d-165811852\u0026nid\u003d520113726631"},"similar":{"url":"/search?type\u003dsimilar\u0026app\u003di2i\u0026rec_type\u003d1\u0026uniqpid\u003d-165811852\u0026nid\u003d520113726631"}},"p4pTags":[],"nid":"520113726631","category":"50009866","pid":"-165811852","title":"【三只松鼠_猪肉猪肉脯210g】休闲食品小吃\u003cspan class\u003dH\u003e零食\u003c/span\u003e靖江特产猪肉干","raw_title":"【三只松鼠_猪肉猪肉脯210g】休闲食品小吃零食靖江特产猪肉干","pic_url":"//g-search2.alicdn.com/img/bao/uploaded/i4/i2/TB1bTo6KpXXXXb5XVXXXXXXXXXX_!!0-item_pic.jpg","detail_url":"//detail.tmall.com/item.htm?id\u003d520113726631\u0026ns\u003d1\u0026abbucket\u003d0","view_price":"19.90","view_fee":"0.00","item_loc":"安徽 芜湖","reserve_price":"40.00","view_sales":"137017人付款","comment_count":"502145","user_id":"880734502","nick":"三只松鼠旗舰店","shopcard":{"levelClasses":[{"levelClass":"icon-supple-level-jinguan"},{"levelClass":"icon-supple-level-jinguan"},{"levelClass":"icon-supple-level-jinguan"},{"levelClass":"icon-supple-level-jinguan"},{"levelClass":"icon-supple-level-jinguan"}],"isTmall":true,"delivery":[484,1,1637],"description":[491,1,3066],"service":[486,1,1630],"encryptedUserId":"UOmgWMGv0MFNy","sellerCredit":20,"totalRate":10000},"icon":[{"title":"双十一商品","dom_class":"icon-fest-shuangshiyi","position":"1","show_type":"0","icon_category":"baobei","outer_text":"0","html":"","icon_key":"icon-fest-shuangshiyi","trace":"srpservice","traceIdx":6,"innerText":"双十一商品"},{"title":"双11购物券","dom_class":"icon-fest-shuangshiyigouwuquan","position":"1","show_type":"0","icon_category":"baobei","outer_text":"0","html":"","icon_key":"icon-fest-shuangshiyigouwuquan","trace":"srpservice","traceIdx":7,"innerText":"双11购物券"},{"title":"度量单位","dom_class":"icon-service-duliangheng","position":"1","show_type":"0","icon_category":"cat_special","outer_text":"0","html":"\u003cspan class\u003d\"icon-pit 
icon-service-duliang\"\u003e\u003cb\u003e47.38\u003c/b\u003e元/500g\u003c/span\u003e","icon_key":"icon-service-duliangheng","trace":"srpservice","traceIdx":8,"innerText":"度量单位"},{"title":"尚天猫,就购了","dom_class":"icon-service-tianmao","position":"1","show_type":"0","icon_category":"baobei","outer_text":"0","html":"","icon_key":"icon-service-tianmao","trace":"srpservice","traceIdx":9,"innerText":"天猫宝贝","url":"//www.tmall.com/"}],"comment_url":"//detail.tmall.com/item.htm?id\u003d520113726631\u0026ns\u003d1\u0026abbucket\u003d0\u0026on_comment\u003d1","shopLink":"//store.taobao.com/shop/view_shop.htm?user_number_id\u003d880734502","risk":""}
所以我只需要简单写正则表达式获取即可!甚至还不需要去解析json~
# Raw string so that \d reaches the regex engine as written instead of being an
# invalid string escape. Groups, in order: raw title, picture URL, price,
# number of buyers, comment count.
pattern = r'"raw_title":"([^"]*)","pic_url":"([^"]*)","detail_url":"[^"]*","view_price":"([^"]*)","view_fee":"[^"]*","item_loc":"[^"]*","reserve_price":"[^"]*","view_sales":"(\d+)人付款","comment_count":"(\d+)"'
上述solution的时效性
注意,这个小技巧只是因为淘宝暂时这样做,所以才行得通,如果后续它用别的方法放数据,就得因时制宜了
jacket,
2016年11月4日 09:54于至善园
作品二 :
// Search keyword whose JD search results are crawled.
// NOTE(review): the "//@input(...)" lines look like crawler-platform directives
// that expose the preceding variable as a user setting - kept verbatim; confirm.
var keyword = "d3.js";
//@input(keyword, 查询关键字, 爬取该关键字搜索出来的京东商品)
var comment_count = 100;
//@input(comment_count, 爬取的评论数, 最多爬取多少条评论)

// Number of comment pages to fetch (presumably 10 comments per page - confirm);
// rounded up so that a partial last page still counts as a page.
var page_count = Math.ceil(comment_count / 10);

/**
 * Escapes every regular-expression metacharacter in `text` so the result
 * matches `text` literally when embedded in a regex source string.
 *
 * @param {string} text - raw text
 * @returns {string} regex-safe text
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

keyword = keyword.trim();

// Entry URL: spaces in the keyword are sent as "+".
var scanUrls = [];
scanUrls.push("http://search.jd.com/Search?keyword=" + keyword.replace(/ /g, "+") + "&enc=utf-8&scrolling=y&page=200");

// Pattern for list pages. The keyword is fully escaped first (not only its
// dots), then each space becomes a literal "+", mirroring the scan URL above.
var helperUrlRegexes = [];
helperUrlRegexes.push("http://search\\.jd\\.com/Search\\?keyword=" + escapeRegExp(keyword).replace(/ /g, "\\+") + "&enc=utf-8&scrolling=y&page=\\d+");
// Crawler configuration: which domains and URLs to visit and which fields to
// extract from every product page.
var configs = {
  domains: ["search.jd.com", "item.jd.com", "club.jd.com"],
  scanUrls: scanUrls,
  // Product detail pages (the dot before "html" is escaped so it only matches a dot).
  contentUrlRegexes: ["http://item\\.jd\\.com/\\d+\\.html"],
  helperUrlRegexes: helperUrlRegexes,
  interval: 10000,
  fields: [
    {
      // First extracted field: product title
      name: "title",
      selector: "//div[@id='name']/h1",
      required: true
    },
    {
      // Second extracted field: product id
      name: "productid",
      selector: "//div[contains(@class,'fl')]/span[2]",
      required: true
    },
    {
      // One entry per <span> inside the injected "comment-pages" element.
      name: "comments",
      selector: "//div[@id='comment-pages']/span",
      repeated: true,
      children: [
        {
          name: "page",
          selector: "//text()"
        },
        {
          // Comments of that page, read from the attached JSON URL.
          name: "comments",
          sourceType: SourceType.AttachedUrl,
          attachedUrl: "http://club.jd.com/productpage/p-{$.productid}-s-0-t-3-p-{page}.html",
          selectorType: SelectorType.JsonPath,
          selector: "$.comments",
          repeated: true,
          children: [
            {
              name: "com_content",
              selectorType: SelectorType.JsonPath,
              selector: "$.content"
            },
            {
              name: "com_nickname",
              selectorType: SelectorType.JsonPath,
              selector: "$.nickname"
            }
          ]
        }
      ]
    }
  ]
};
/**
 * Hook invoked with each downloaded page. For product pages it asks the
 * comment endpoint how many comments exist and injects
 * <div id="comment-pages"><span>0</span><span>1</span>...</div> before
 * </body>, one <span> per comment page (capped by page_count).
 *
 * @param page - downloaded page; `page.url` is read and `page.raw` is rewritten
 * @param site - object whose `requestUrl(url)` returns the response body
 * @returns the same page object
 */
configs.afterDownloadPage = function(page, site) {
  var matches = /item\.jd\.com\/(\d+)\.html/.exec(page.url);
  if (!matches) return page;

  var commentUrl = "http://club.jd.com/productpage/p-" + matches[1] + "-s-0-t-3-p-0.html";
  var result = site.requestUrl(commentUrl);

  // An empty or malformed response, or one without a comment summary, is
  // treated as "no comments" instead of aborting the hook with an exception.
  var commentCount = 0;
  try {
    var summary = JSON.parse(result).productCommentSummary;
    if (summary && summary.commentCount > 0) {
      commentCount = summary.commentCount;
    }
  } catch (e) {
    commentCount = 0;
  }

  var pages = commentCount / 10;
  if (pages > page_count) pages = page_count;

  var pageHtml = "<div id=\"comment-pages\">";
  for (var i = 0; i < pages; i++) {
    pageHtml += "<span>" + i + "</span>";
  }
  pageHtml += "</div>";

  var index = page.raw.indexOf("</body>");
  if (index === -1) {
    // No closing body tag: append rather than letting substring(0, -1) prepend.
    page.raw += pageHtml;
  } else {
    page.raw = page.raw.substring(0, index) + pageHtml + page.raw.substring(index);
  }
  return page;
};
// SKU of the first product on the first list page processed; 0 = none recorded.
var dataSku = 0;

/**
 * Hook invoked for each list (helper) page. Queues the following list page
 * until a page starts with the same first SKU as the recorded one.
 *
 * @param page - current page; `page.url` is read
 * @param content - page content passed to `extract`
 * @param site - object whose `addUrl(url)` queues a URL
 * @returns false when paging stops, true otherwise
 *          (NOTE(review): meaning of the return value to the platform is not
 *          visible here - confirm)
 */
configs.onProcessHelperPage = function(page, content, site) {
  var num = parseInt(extract(content, "//*[@id='J_goodsList']/ul/li[1]/@data-sku"), 10);
  if (dataSku === 0) {
    dataSku = isNaN(num) ? 0 : num;
  } else if (dataSku === num) {
    // Same first product as the recorded page: reset and stop paging.
    dataSku = 0;
    return false;
  }

  var pageMatch = /&page=(\d+)/.exec(page.url);
  if (!pageMatch) {
    // No page parameter to advance.
    return true;
  }
  var currentPageNum = parseInt(pageMatch[1], 10);
  if (currentPageNum === 0) {
    currentPageNum = 1;
  }
  var pageNum = currentPageNum + 1;
  // Replace whatever number is in the URL; a literal "&page=1" search would
  // miss a URL that actually carries "&page=0" and re-queue the same page.
  var nextUrl = page.url.replace(/&page=\d+/, "&page=" + pageNum);
  site.addUrl(nextUrl);
  return true;
};
/**
 * Hook invoked with the extracted data of a page. Flattens `data.comments`
 * (one entry per comment page, each holding its own `comments` array) into a
 * single array of comments.
 *
 * @param page - the processed page (unused)
 * @param data - extracted record; `data.comments` is replaced in place
 * @returns the same data object
 */
configs.afterExtractPage = function(page, data) {
  var perPage = data.comments;
  if (perPage == null) {
    return data;
  }

  var flattened = [];
  var pageIdx = 0;
  while (pageIdx < perPage.length) {
    var batch = perPage[pageIdx].comments;
    var k = 0;
    while (k < batch.length) {
      flattened.push(batch[k]);
      k += 1;
    }
    pageIdx += 1;
  }

  data.comments = flattened;
  return data;
};
// Create the crawler from `configs` and start it.
var crawler = new Crawler(configs); crawler.start();
// Happy Double 11 to Niumei, and happy Double 11 to everyone (*^▽^*)
作品三:
1. 前言
牛客网在组织一个编程之美的活动, 这次的题目是 http://www.nowcoder.com/discuss/18223?type=0&order=0&pos=2&page=1
正好是使用爬虫进行操作的。就想到使用pyspider写一下了。
pyspider的相关资料:
http://docs.pyspider.org/en/latest/tutorial/
2. 实现流程
2.1 分析网页
我们的目标站点是 https://chi.taobao.com
本以为根据页面上的这些信息, 我们可以非常方便地解析出相应商品的各种信息, 然而并不是这样, 商品的数据都是通过json异步加载出来的。
从网页源码来看, 根本提取不到任何有用信息
这个是没有加载上json数据时候的基本情况
通过chrome强大的监控功能,我们找到了请求的数据
得到数据源之后, 处理就比较方便了
2.2 参考资料及其记录
- windows 上 pyspider 出现各种莫名其妙的问题, 建议使用 linux
- Python Objects与String之间转换 : http://blog.sina.com.cn/s/blog_4ddef8f80102v8af.html
- mysqldb 使用 http://blog.csdn.net/zhyh1435589631/article/details/51544903
- pyquery 属性 https://pythonhosted.org/pyquery/attributes.html
- mysqldb 安装出错: http://stackoverflow.com/questions/5178292/pip-install-mysql-python-fails-with-environmenterror-mysql-config-not-found
- pyspider 解析json https://segmentfault.com/a/1190000002477870
2.3 数据库相关
2.4 实现代码
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-11-05 23:18:55
# Project: taobao_food
from pyspider.libs.base_handler import *
import re
import json
import MySQLdb
class Handler(BaseHandler):
    """pyspider handler that reads a Taobao JSONP feed and stores the listed
    items (picture URL, discounted price, title) in the MySQL table food_info."""

    # Database connection settings
    def __init__(self):
        db_host = "127.0.0.1"
        user = "root"
        # NOTE(review): credentials are hard-coded in the source; consider
        # loading them from configuration instead.
        passwd = "zhyh2010"
        db = "taobao_food"
        charset = "utf8"
        conn = MySQLdb.connect(host=db_host, user=user, passwd=passwd, db=db, charset=charset)
        conn.autocommit(True)
        # Cursor used by product_info for the inserts
        self.db = conn.cursor()

    # Start URL of the crawl
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://tce.taobao.com/api/mget.htm?callback=jsonp221&tce_sid=659631&tce_vid=8,2&tid=,&tab=,&topic=,&count=,&env=online,online',
                   callback=self.json_parser)

    # Extract the JSON text from the JSONP response
    @config(age=24 * 60 * 60)
    def select_json(self, response):
        """Return the JSON text wrapped by the JSONP call in ``response.text``.

        Raises ValueError when the response contains no JSONP call.
        """
        content = response.text
        # Greedy capture up to the LAST ")" so that parentheses inside the JSON
        # (for example in item titles) do not truncate the payload.
        pattern = re.compile(r'window\.jsonp.*?\((.*)\)', re.S)
        match = pattern.search(content)
        if match is None:
            raise ValueError("no JSONP payload found in response")
        return match.group(1).strip()

    # Pick the wanted fields and insert them into the database table
    def product_info(self, response):
        """Insert one row per entry of ``response["result"]`` into food_info."""
        sql = "insert into food_info(url, price, title) values (%s,%s,%s)"
        rows = [
            ("https:" + data["item_pic"], data["item_youhui_price"], data["item_title"])
            for data in response["result"]
        ]
        if rows:
            # Single batched call instead of one executemany per row
            self.db.executemany(sql, rows)

    # Parse the JSON and store every section of the result
    @config(age=24 * 60 * 60)
    def json_parser(self, response):
        content = self.select_json(response)
        contents = json.loads(content)
        sections = contents["result"]
        for key in sections:
            self.product_info(sections[key])
3. 效果
作品四:
#coding=utf-8
# 测试运行环境: ubuntu 14.04 python 2.7.6 chrome
# 需要安装 selenium chromedriver
from selenium import webdriver
from selenium.webdriver.common.by import By
import sys
GBK = 'gbk'
def logToFile(tag, msg):
    """Write ``tag``, the separator '--' plus a newline, and ``msg`` to the
    file 'log' in the current directory.

    The file is opened in 'w' mode, so each call replaces the previous content.
    """
    out = tag + '--\n' + msg
    # The context manager guarantees the handle is flushed and closed,
    # even when write() raises.
    with open('log', 'w') as logFile:
        logFile.write(out)
def log(tag, msg):
    """Print ``tag`` followed by ' -- ' on one line, then ``msg`` on the next
    (Python 2 print statements)."""
    print tag, ' -- '
    print msg
def defLog(msg):
    """Log ``msg`` through log() using the fixed tag 'out'."""
    log('out', msg)
# Holds the information scraped for one snack
class Item:
    def __init__(self):
        # Encoding name stored on every item; nothing in the visible code reads it
        self.CODE = 'utf-8'
# Writes content to a markdown file
class MarkdownWriter:
    def __init__(self, name='out.md'):
        """Open ``name`` for writing; existing content is replaced."""
        self.mdFile = open(name, 'w')

    def writeContent(self, content):
        """Write ``content`` and flush it, so the text reaches the disk even if
        close() is never called."""
        self.mdFile.write(content)
        self.mdFile.flush()

    def writeItems(self, title, items):
        """Write a '###' heading with ``title`` followed by one '####' section
        per item (reads item.title, item.url, item.money, item.store)."""
        # Assemble the markdown text
        parts = ['### ' + title + ' \n']
        for item in items:
            parts.append('#### ' + item.title + ' \n')
            parts.append(' \n')
            parts.append('[goto](' + item.url + ') \n')
            parts.append('money: ' + item.money + ' \n')
            parts.append('store: ' + item.store + ' \n')
            parts.append('\n\n')
        self.writeContent(''.join(parts))

    def close(self):
        """Close the underlying file; calling it more than once is harmless."""
        self.mdFile.close()
class TaoBaoSpider:
def __init__(self):
driver = webdriver.Chrome()
self.driver = driver
def getUrl(self, url):
print 'start get ...'
# 通过chrome加载url包括js脚本
self.driver.get(url)
print 'get finished ...'
def getHtmlWithJs(self):
return self.driver.page_source
def getElements(self):
print 'get item ...'
els = self.driver.find_elements(By.CSS_SELECTOR, "li[class=' item item-border'")
return els
def getContent(self, element):
item = Item()
# 从获取的html页面中获取需要的信息并封装到item类中
item.img = element.find_element_by_tag_name('img').get_attribute('src')
item.money = element.find_element(By.CSS_SELECTOR, "div[class='price child-component clearfix'").find_element_by_tag_name('strong').text
titleElement = element.find_element(By.CSS_SELECTOR, "div[class='title child-component'").find_element_by_class_name('J_AtpLog')
item.title = titleElement.text
item.url = titleElement.get_attribute('href')
item.store = element.find_element(By.CSS_SELECTOR, "div[class='seller child-component clearfix'").find_element_by_tag_name('a').text
return item
def start(self, url):
self.url = url
self.getUrl(url)
els = self.getElements()
items = []
for e in els:
item = self.getContent(e)
items.append(item)
return items
def main():
    """Crawl the search page and write the found snacks to taobao.md."""
    # Set the default string encoding (Python 2 only workaround)
    reload(sys)
    sys.setdefaultencoding('utf-8')
    url = 'https://world.taobao.com/search/search.htm?_ksTS=1478358034370_312&spm=a21bp.7806943.20151106.1&search_type=0&_input_charset=utf-8&navigator=all&json=on&q=%E5%A5%B3%E6%9C%8B%E5%8F%8B%20%E9%9B%B6%E9%A3%9F&cna=Eg9NDplivkkCAXuCB323%2Fsy9&callback=__jsonp_cb&abtest=_AB-LR517-LR854-LR895-PR517-PR854-PR895'
    # Run the spider
    spider = TaoBaoSpider()
    try:
        items = spider.start(url)
    finally:
        # Always quit the browser, otherwise the Chrome process stays alive
        spider.driver.quit()
    # Write the result to a markdown file
    writer = MarkdownWriter('taobao.md')
    try:
        writer.writeItems('零食列表', items)
    finally:
        # Close the file so the content is flushed to disk
        writer.mdFile.close()
main()
作品五:
package years.year2016.months11;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import years.year2016.months10.WebUtil;
public class WebDataGain {
    public static void main(String[] args) {
        WebDataGain w = new WebDataGain();
        String url = "http://www.biqugezw.com/3_3096/";
        String bookname = "一念永恒";
        w.downNovel_Biqugezw(url, bookname);
    }

    /**
     * Downloads the chapters of a novel from the Biquge site and hands each
     * chapter's text to WebUtil.downloadText.
     * <p>
     * I/O failures are printed, not thrown: a failed index page ends the
     * method, a failed chapter page is skipped.
     *
     * @param url      address of the novel's chapter-index page
     * @param bookName book title, used for the output file name and directory
     */
    public void downNovel_Biqugezw(String url, String bookName) {
        String url_root = "http://www.biqugezw.com";
        // Connect to the site with Jsoup
        Document doc;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException e2) {
            e2.printStackTrace();
            // Without the index page there is nothing to download
            return;
        }
        // Chapter links live inside the element with id "list"
        Elements elementList = doc.select("#list");
        String query = "a[href~=/[0-9]{1}_[0-9]{4}/.*html]";
        Elements elements = elementList.select(query);
        int size = elements.size();
        System.out.println(size);
        String fileName = "";
        // NOTE(review): hard-coded index of the first chapter link to download
        // (presumably a resume point) - confirm
        int initnum = 371;
        for (int i = initnum; i < size; i++) {
            // The file name depends only on the index, so it is updated before
            // the download; a skipped chapter cannot leave it stale.
            int num = i % 20;
            if (num == 0) {
                fileName = (i == 0) ? "1-20章" : i + "-" + (i + 20) + "章节";
            } else if (i == initnum) {
                int temp = initnum - num;
                fileName = temp + "-" + (temp + 20) + "章节";
            }

            Element e = elements.get(i);
            String href = e.attr("href");
            String tempurl = url_root + href;
            System.out.println(tempurl);
            Document docInner;
            try {
                docInner = Jsoup.connect(tempurl).get();
            } catch (IOException e1) {
                e1.printStackTrace();
                System.out.println(fileName);
                System.out.println(i);
                // Skip this chapter instead of dereferencing a null document
                continue;
            }
            Elements elementsClass = docInner.select(".bookname ");
            Elements elementsH = elementsClass.select("h1");
            String sectionkname = elementsH.text();
            System.out.println(sectionkname);
            Elements elementsContent = docInner.select("#content");
            String content = elementsContent.text();
            System.out.println(content);
            try {
                WebUtil.downloadText(sectionkname + " " + content, bookName + "--" + fileName + ".txt", WebUtil.getFileDir() + "//book//" + bookName + "//");
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
    }
}
作品六 :
| package com.huowolf.service; | |
| import java.io.IOException; | |
| import java.util.Iterator; | |
| import java.util.Set; | |
| import java.util.TreeSet; | |
| import org.jsoup.Jsoup; | |
| import org.jsoup.nodes.Document; | |
| import org.jsoup.nodes.Element; | |
| import org.jsoup.select.Elements; | |
| import com.huowolf.pojo.Product; | |
| public class SpiderService { | |
| /** | |
| * 爬取一个页面的数据,返回得到的商品对象的集合。 | |
| */ | |
| public Set<Product> spiderOnepage(String keyword, int page) { | |
| TreeSet<Product> productSet = new TreeSet<Product>(); | |
| /* | |
| * offset=2代表页面中的按销量排序 | |
| */ | |
| try { | |
| Document doc = Jsoup.connect( | |
| "http://search.jd.com/Search?enc=utf-8qrst=1&rt=1&psort=3&stop=1&vt=2&offset=2&click=0") | |
| .data("keyword", keyword).data("page", String.valueOf(page)).userAgent("Mozilla") | |
| .cookie("auth", "token").get(); | |
| Elements goodsList = doc.select(".gl-item"); | |
| for (Element goodsEle : goodsList) { | |
| // 把解析的内容封装为一个对象 | |
| Product p = new Product(); | |
| // 图片地址出现在src或者data-lazy-img属性上 | |
| String img = goodsEle.select(".p-img a img").attr("src"); | |
| if (img.equals("")) { | |
| img = goodsEle.select(".p-img a img").attr("data-lazy-img"); | |
| } | |
| p.setImage("http:" + img); // 设置图片 | |
| String price = goodsEle.select(".p-price strong").attr("data-price"); | |
| System.out.println(price); | |
| if(!price.equals("0.00")){ | |
| p.setPrice(Double.parseDouble(price)); // 设置价格 | |
| }else{ | |
| System.out.println( goodsEle.select(".p-price").html()); | |
| } | |
| goodsEle.select(".p-name em font").unwrap(); // 先去掉<font class="skcolor_ljg"></font> | |
| String description = goodsEle.select(".p-name em").first().html(); // 可能会出现多个 | |
| p.setDescription(description); // 设置描述 | |
| String commit = goodsEle.select(".p-commit a").first().html(); | |
| commit = commit.substring(0, commit.length() - 1); | |
| long commitPro = 0; | |
| if (commit.contains("万")) { | |
| commit = commit.replaceAll("万", ""); | |
| commitPro = (long) ((Double.parseDouble(commit)) * 10000); | |
| } else { | |
| commitPro = (long) Double.parseDouble(commit); | |
| } | |
| p.setComment(commitPro); // 设置评论量 | |
| productSet.add(p); | |
| } | |
| Iterator<Product> it = productSet.iterator(); | |
| int num = 0; | |
| while (it.hasNext()) { | |
| Product p = it.next(); | |
| num++; | |
| //留下本页销量最高的十条数据 | |
| if (num > 10) { | |
| it.remove(); | |
| } | |
| //System.out.println(p); | |
| } | |
| } catch (IOException e) { | |
| e.printStackTrace(); | |
| } | |
| return productSet; | |
| } | |
| public void spiderData(String keyword){ | |
| TreeSet<Product> productSet = new TreeSet<Product>(); | |
| //爬取代表销量最高的10页数据 | |
| for (int i = 1; i <= 10; i++) { | |
| TreeSet<Product> productTemp = (TreeSet<Product>) spiderOnepage(keyword, i); | |
| System.out.println("-----------------------"+productTemp.size()); | |
| productSet.addAll(productTemp); | |
| //睡眠100毫秒,否则有的价格数据抓取不到。 | |
| try { | |
| Thread.sleep(500); | |
| } catch (InterruptedException e) { | |
| e.printStackTrace(); | |
| } | |
| } | |
| System.out.println("-----------------------"+productSet.size()); | |
| Iterator<Product> it = productSet.iterator(); | |
| int num = 0; | |
| while (it.hasNext()) { | |
| Product p = it.next(); | |
| //num++; | |
| //留下本页销量最高的十条数据 | |
| /* if (num >= 10) { | |
| it.remove(); | |
| }*/ | |
| //System.out.println(p); | |
| } | |
| } | |
|
}
|
作品七:
// PhantomJS script: opens a JD search page and saves a screenshot of it to
// ./test.png, then terminates the PhantomJS process.
var webpage = require('webpage');
var page = webpage.create();

page.viewportSize = { width: 1024, height: 800 };
page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };
page.settings = {
  javascriptEnabled: false,
  loadImages: true,
  userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/19.0'
};

page.open('http://search.jd.com/Search?keyword=%E5%A5%B3%E6%9C%8B%E5%8F%8B%E9%9B%B6%E9%A3%9F&enc=utf-8&pvid=u4eabcvi.cns6qn', function (status) {
  var exitCode = 0;
  if (status === 'fail') {
    console.log('open page fail!');
    exitCode = 1;
  } else {
    page.render('./test.png');
    console.log('the mirrior page has saved ');
  }
  // release the memory
  page.close();
  // Without an explicit exit the PhantomJS process keeps running forever.
  phantom.exit(exitCode);
});


查看5道真题和解析