Python抓取華中大二手市場商品信息
最近做項目需要獲取一些商品信息,於是就寫了個簡單的腳本來抓取某電子商場。基本原理是:發送 HTTP 請求(request),然後分析響應(response)的文本內容,用正則表達式匹配出想要的信息。
#coding=utf-8
#time:2014/4/29
#author:Li
#OS:windows
import io
import json
import os
import re

import requests
# Patterns are compiled once at import time instead of once per fetched page.
_DETAIL_LINK_RE = re.compile(r'ui-link-img[^>]+?href="(/goods/details/.+?)"')
_NAME_RE = re.compile(r'stock-info-name.+?>(.+?)</h3>')
_PRICE_RE = re.compile(r'stock-price.+?>(.+?)</span>')
_ATTR_RE = re.compile(r'stock-info-attr.+?>([^<]+?)</div>')

# Placeholder stored for any field that could not be extracted from a page.
_MISSING = u"無"

# Record keys filled, in order of appearance, from the "stock-info-attr" blocks.
_ATTR_KEYS = ("addr", "time", "Tags", "contact", "QQ")

# (record key, label written to the output file), in output order.
_OUTPUT_FIELDS = (
    ("name", u"名稱"),
    ("price", u"價格"),
    ("addr", u"交易地點"),
    ("time", u"發布時間"),
    ("Tags", u"Tags"),
    ("contact", u"聯係人"),
    ("QQ", u"QQ"),
)


def _fetch_text(url):
    """Download *url* and return the response body decoded as UTF-8 text."""
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    return resp.text


def _parse_details(html):
    """Extract one goods record (a dict of text fields) from a detail page.

    Each field is looked up independently; a field whose pattern does not
    match keeps the placeholder value instead of aborting the whole record.
    """
    record = dict.fromkeys(("name", "price") + _ATTR_KEYS, _MISSING)

    name = _NAME_RE.search(html)
    if name is not None:
        record["name"] = name.group(1).strip()

    price = _PRICE_RE.search(html)
    if price is not None:
        record["price"] = price.group(1).strip()

    # The attribute blocks are identified by position only, so they are
    # applied all-or-nothing: a page with fewer blocks than expected leaves
    # every attribute at the placeholder rather than guessing which is which.
    attrs = _ATTR_RE.findall(html)
    if len(attrs) >= len(_ATTR_KEYS):
        for key, value in zip(_ATTR_KEYS, attrs):
            record[key] = value.strip()

    return record


def _format_record(record):
    """Render *record* as one JSON object line terminated by CRLF."""
    # json.dumps quotes and escapes each value, so quotes, backslashes or
    # newlines inside scraped text cannot break the line format.
    pairs = [
        u'"' + label + u'":' + json.dumps(record[key], ensure_ascii=False)
        for key, label in _OUTPUT_FIELDS
    ]
    return u"{" + u",".join(pairs) + u"}\r\n"


def catch_ershou():
    """Scrape the HUST second-hand market and append the goods to a file.

    Walks index pages 1 through 21 of the site, collects every goods detail
    link found on them, fetches each detail page, and extracts name, price,
    trade location, publish time, tags, contact and QQ. One JSON object per
    item is appended to ``ershou.txt`` in the current working directory.
    Progress is printed to stdout; a failure while writing the file is
    reported on stdout rather than raised.

    Network errors from ``requests`` are not caught and propagate to the
    caller.
    """
    host_url = "https://ershou.hustonline.net"

    # The listing spans 21 index pages.
    index_urls = [
        host_url + '/index/index/' + str(page) + '/all'
        for page in range(1, 22)
    ]

    # Collect the absolute URL of every goods detail page.
    links = []
    for url in index_urls:
        listing = _fetch_text(url)
        links.extend(host_url + path for path in _DETAIL_LINK_RE.findall(listing))

    # Fetch and parse each detail page. The single-argument print(...) form
    # is valid on both Python 2 and Python 3.
    good_arr = []
    for link in links:
        print(link)
        good_arr.append(_parse_details(_fetch_text(link)))
    print("total links:" + str(len(links)))

    # Append the results; the with-block closes the file even if a write fails.
    out_path = os.path.join(os.getcwd(), 'ershou.txt')
    try:
        with io.open(out_path, 'a', encoding='utf-8') as fp:
            fp.writelines(_format_record(record) for record in good_arr)
    except (IOError, OSError):
        print("write reasult in file failed!")
    print("all is done...")
def main():
    """Script entry point: run the second-hand market scraper once."""
    catch_ershou()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
最後更新:2017-04-03 12:56:29