834
京東網上商城
Python抓取華中大二手市場商品信息
最近做項目需要獲取一些商品信息,於是就寫了個簡單的腳本來抓取某電子商場。基本原理是發送request請求然後分析response文本信息,正則匹配想要的內容。
# coding=utf-8
# time: 2014/4/29
# author: Li
# OS: originally written on Windows; the output path is now built portably
"""Scrape goods listings from the HUST second-hand market site.

The script walks the paginated index, collects every goods-detail link,
extracts a handful of fields from each detail page with regular
expressions, and appends one JSON object per item to ``ershou.txt``.

Targets Python 3. ``requests`` is the only third-party dependency and is
needed only when the default network fetcher is used.
"""
import collections
import json
import os
import re

HOST_URL = "https://ershou.hustonline.net"
PAGE_COUNT = 21  # number of index pages the original script walked
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the run
MISSING = "無"  # placeholder written for any field that could not be extracted

# Compiled once at import time instead of once per page.
_LINK_RE = re.compile(r'ui-link-img[^>]+?href="(/goods/details/.+?)"')
_NAME_RE = re.compile(r'stock-info-name.+?>(.+?)</h3>')
_PRICE_RE = re.compile(r'stock-price.+?>(.+?)</span>')
_ATTR_RE = re.compile(r'stock-info-attr.+?>([^<]+?)</div>')

# The detail page lists these attributes in this order.
_ATTR_KEYS = ("addr", "time", "Tags", "contact", "QQ")

# (internal key, label written to the output file), in output order.
_OUTPUT_LABELS = (
    ("name", "名稱"),
    ("price", "價格"),
    ("addr", "交易地點"),
    ("time", "發布時間"),
    ("Tags", "Tags"),
    ("contact", "聯係人"),
    ("QQ", "QQ"),
)


def _fetch_page(url):
    """Return the UTF-8 decoded body of *url*, or None if the request failed."""
    # Imported here so the parsing/formatting logic stays importable (and
    # testable) on machines where the third-party package is not installed.
    import requests

    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as err:
        print("request failed: {0} ({1})".format(url, err))
        return None
    # The site serves UTF-8; force it so ``.text`` does not guess wrongly.
    resp.encoding = 'utf-8'
    return resp.text


def _extract_links(html, host_url):
    """Return absolute goods-detail URLs found in one index page."""
    return [host_url + path for path in _LINK_RE.findall(html)]


def _parse_goods(html):
    """Extract the goods fields from one detail page; absent fields stay MISSING."""
    info = collections.OrderedDict(
        (key, MISSING) for key in ("name", "price") + _ATTR_KEYS
    )

    name = _NAME_RE.search(html)
    if name:
        info["name"] = name.group(1).strip()

    price = _PRICE_RE.search(html)
    if price:
        info["price"] = price.group(1).strip()

    # Attributes are identified purely by position, so they are only trusted
    # when all of them are present; a partial list could be misaligned.
    attrs = _ATTR_RE.findall(html)
    if len(attrs) >= len(_ATTR_KEYS):
        for key, value in zip(_ATTR_KEYS, attrs):
            info[key] = value.strip()
    return info


def _format_record(info):
    """Serialise one goods record as a compact JSON object ending in CRLF."""
    record = collections.OrderedDict(
        (label, info[key]) for key, label in _OUTPUT_LABELS
    )
    # json.dumps escapes quotes/backslashes inside values, which the previous
    # hand-built string did not; compact separators keep the old layout.
    return json.dumps(record, ensure_ascii=False, separators=(",", ":")) + "\r\n"


def catch_ershou(host_url=HOST_URL, pages=PAGE_COUNT, output_path=None,
                 fetch=None):
    '''Scrape the HUST second-hand market and append the results to a file.

    Args:
        host_url: Scheme and host of the site, without a trailing slash.
        pages: Number of index pages to walk, starting from page 1.
        output_path: File to append records to. Defaults to ``ershou.txt``
            in the current working directory.
        fetch: Callable taking a URL and returning the page text, or None
            when the page could not be retrieved. Defaults to an HTTP GET
            via ``requests``.

    Returns:
        The list of goods records (dicts) that were scraped, in link order.
        Pages that could not be fetched are reported and skipped.
    '''
    if fetch is None:
        fetch = _fetch_page
    if output_path is None:
        output_path = os.path.join(os.getcwd(), 'ershou.txt')

    index_urls = [
        '{0}/index/index/{1}/all'.format(host_url, page)
        for page in range(1, pages + 1)
    ]

    # Collect every goods-detail link from the index pages.
    links = []
    for url in index_urls:
        html = fetch(url)
        if html is not None:
            links.extend(_extract_links(html, host_url))

    # Visit each detail page and pull out the goods information.
    goods = []
    for link in links:
        print(link)
        html = fetch(link)
        if html is None:
            continue
        goods.append(_parse_goods(html))
    print("total links:" + str(len(links)))

    # Append the results; newline='' stops Windows text mode from turning the
    # explicit "\r\n" terminator into "\r\r\n".
    try:
        with open(output_path, 'a', encoding='utf-8', newline='') as fp:
            fp.writelines(_format_record(info) for info in goods)
    except OSError as err:
        print("write reasult in file failed!", err)
    print("all is done...")
    return goods


def main():
    """Run the scraper with its default settings."""
    catch_ershou()


if __name__ == '__main__':
    main()
最後更新:2017-04-03 12:56:29