Python抓取华中大二手市场商品信息
最近做项目需要获取一些商品信息，于是就写了个简单的脚本来抓取某电子商场。基本原理是：先发送 request 请求，然后分析 response 的文本信息，用正则表达式匹配出想要的内容。
#coding=utf-8
#time:2014/4/29
#author:Li
#OS:windows
import io
import json
import os
import re

import requests
# Patterns are compiled once at import time rather than once per fetched page.
_LINK_RE = re.compile(r'ui-link-img[^>]+?href="(/goods/details/.+?)"')
_NAME_RE = re.compile(r'stock-info-name.+?>(.+?)</h3>')
_PRICE_RE = re.compile(r'stock-price.+?>(.+?)</span>')
_ATTR_RE = re.compile(r'stock-info-attr.+?>([^<]+?)</div>')

# The detail page lists these attributes positionally, in this order.
_ATTR_KEYS = ("addr", "time", "Tags", "contact", "QQ")

# (internal key, label written to the output file), in output order.
_OUTPUT_FIELDS = (
    ("name", u"名称"),
    ("price", u"价格"),
    ("addr", u"交易地点"),
    ("time", u"发布时间"),
    ("Tags", u"Tags"),
    ("contact", u"联系人"),
    ("QQ", u"QQ"),
)


def _fetch_text(url):
    '''Download url and return its body decoded as UTF-8 text.'''
    req = requests.get(url)
    # Force UTF-8; this only takes effect because .text (not .content) is read.
    req.encoding = 'utf-8'
    return req.text


def _parse_good(html):
    '''Extract one listing's fields from detail-page HTML; missing ones stay "无".'''
    good_info = {"name": u"无", "price": u"无", "addr": u"无", "time": u"无",
                 "Tags": u"无", "contact": u"无", "QQ": u"无"}

    # Each field is looked up independently so that one missing element does
    # not discard the fields that are present.
    name_match = _NAME_RE.search(html)
    if name_match is not None:
        good_info["name"] = name_match.group(1).strip()

    price_match = _PRICE_RE.search(html)
    if price_match is not None:
        good_info["price"] = price_match.group(1).strip()

    # Attributes are identified only by position, so a short list cannot be
    # labelled reliably: fill them all or leave all of them at the default.
    attrs = _ATTR_RE.findall(html)
    if len(attrs) >= len(_ATTR_KEYS):
        for key, value in zip(_ATTR_KEYS, attrs):
            good_info[key] = value.strip()
    return good_info


def _format_good(good_info):
    '''Render one listing as a single compact JSON-style line ending in CRLF.'''
    # json.dumps quotes and escapes each value, so a '"' or '\\' inside a
    # listing title can no longer break the record.
    fields = [u'"%s":%s' % (label, json.dumps(good_info[key], ensure_ascii=False))
              for key, label in _OUTPUT_FIELDS]
    return u'{' + u','.join(fields) + u'}\r\n'


def catch_ershou(pages=21):
    '''Scrape the HUST second-hand market and append the listings to ershou.txt.

    Walks the paginated index, follows every goods detail link, extracts the
    name, price, address, time, tags, contact and QQ of each listing with
    regular expressions, and appends one record per listing to ``ershou.txt``
    in the current working directory (UTF-8 encoded).

    The code is written to be valid on both Python 2.7 and Python 3.

    Args:
        pages: number of index pages to walk, starting from page 1.
            Defaults to 21, the page count the site had when this was written.

    Returns:
        None. Progress is printed to stdout; results go to the output file.

    Raises:
        requests.exceptions.RequestException: if any HTTP request fails
            (network errors are not caught here).
    '''
    host_url = "https://ershou.hustonline.net"

    # Collect every goods detail-page link from the paginated index.
    links = []
    for page in range(1, pages + 1):
        index_html = _fetch_text(host_url + '/index/index/' + str(page) + '/all')
        links.extend(host_url + path for path in _LINK_RE.findall(index_html))

    # Fetch and parse each detail page.
    good_arr = []
    for link in links:
        print(link)
        good_arr.append(_parse_good(_fetch_text(link)))
    # len() returns an int; it must be converted before concatenation.
    print("total links:" + str(len(links)))

    # Write the results. os.path.join keeps the path valid on every platform,
    # "with" closes the file even if a write fails, and newline='' writes the
    # CRLF terminator verbatim instead of letting text mode translate it.
    out_path = os.path.join(os.getcwd(), 'ershou.txt')
    try:
        with io.open(out_path, 'a', encoding='utf-8', newline='') as fp:
            for good_info in good_arr:
                fp.write(_format_good(good_info))
    except (IOError, OSError):
        print("write reasult in file failed!")
    print("all is done...")
def main():
    '''Entry point: run the scraper by calling catch_ershou().'''
    catch_ershou()
# Only start scraping when the file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()
最后更新:2017-04-03 12:56:29