閱讀834 返回首頁    go 京東網上商城


Python抓取華中大二手市場商品信息

最近做項目需要獲取一些商品信息,於是寫了個簡單的腳本來抓取某電子商場的數據。基本原理是:先發送 HTTP 請求(request),再分析響應(response)的文本內容,用正則表達式匹配出想要的內容。

#coding=utf-8
#time:2014/4/29
#author:Li
#OS:windows
import io
import os
import re

import requests

# Patterns are compiled once at import time instead of once per fetched page.
_LINK_RE = re.compile(r'ui-link-img[^>]+?href="(/goods/details/.+?)"')
_NAME_RE = re.compile(r'stock-info-name.+?>(.+?)</h3>')
_PRICE_RE = re.compile(r'stock-price.+?>(.+?)</span>')
_ATTR_RE = re.compile(r'stock-info-attr.+?>([^<]+?)</div>')

# Record fields filled, in order, from successive "stock-info-attr" matches.
_ATTR_KEYS = ("addr", "time", "Tags", "contact", "QQ")

# (label written to the output file, key in the record dict), in output order.
_OUTPUT_FIELDS = (
	(u"名稱", "name"),
	(u"價格", "price"),
	(u"交易地點", "addr"),
	(u"發布時間", "time"),
	(u"Tags", "Tags"),
	(u"聯係人", "contact"),
	(u"QQ", "QQ"),
)

# Placeholder stored for any field that could not be extracted.
_MISSING = u"無"


def _fetch_text(url, timeout):
	'''Return the body of url decoded as UTF-8, or None if the request fails.'''
	try:
		resp = requests.get(url, timeout=timeout)
	except requests.RequestException as err:
		print("request failed: %s (%r)" % (url, err))
		return None
	# Setting the encoding only affects resp.text, so .text (not .content) is read.
	resp.encoding = 'utf-8'
	return resp.text


def _parse_good(html):
	'''Extract one goods record from a detail page; unmatched fields keep the placeholder.'''
	record = dict.fromkeys(
		("name", "price", "addr", "time", "Tags", "contact", "QQ"), _MISSING)
	# Each field is matched independently, so one missing element no longer
	# discards the fields that were found.
	for key, pattern in (("name", _NAME_RE), ("price", _PRICE_RE)):
		found = pattern.search(html)
		if found:
			record[key] = found.group(1).strip()
	# zip() stops at the shorter sequence, so a page with fewer attribute
	# blocks than expected fills the leading fields and leaves the rest alone.
	for key, value in zip(_ATTR_KEYS, _ATTR_RE.findall(html)):
		record[key] = value.strip()
	return record


def _format_record(record):
	'''Render one record as a single {"label":"value",...} line ending in CRLF.'''
	pairs = u",".join(
		u'"%s":"%s"' % (label, record[key]) for label, key in _OUTPUT_FIELDS)
	return u"{" + pairs + u"}\r\n"


def catch_ershou(host_url="https://ershou.hustonline.net", page_count=21,
		output_path=None, timeout=10):
	'''Scrape the HUST second-hand market and append every item to a text file.

	The index pages host_url/index/index/<n>/all for n = 1..page_count are
	fetched and searched for links to goods detail pages. Each detail page is
	then fetched and its name, price and attribute blocks are extracted with
	regular expressions. One line per item is appended to the output file.

	Parameters:
		host_url: scheme and host of the site, without a trailing slash.
		page_count: number of index pages to walk (the site had 21 when the
			script was written).
		output_path: file to append to; defaults to "ershou.txt" in the
			current working directory.
		timeout: per-request timeout in seconds passed to requests.get.

	Returns:
		None. Progress and failures are reported with print().

	A failed request is reported and skipped; a detail page that cannot be
	fetched still yields a record with every field set to the placeholder.
	A failure to write the output file is reported, not raised.
	'''
	if output_path is None:
		output_path = os.path.join(os.getcwd(), 'ershou.txt')

	# Collect the detail-page links from every index page.
	links = []
	for page in range(1, page_count + 1):
		html = _fetch_text(host_url + '/index/index/' + str(page) + '/all', timeout)
		if html is None:
			continue
		links.extend(host_url + path for path in _LINK_RE.findall(html))

	# Fetch and parse every detail page.
	goods = []
	for link in links:
		print(link)
		html = _fetch_text(link, timeout)
		goods.append(_parse_good(html if html is not None else u""))
	# len() returns an int; it must be converted before concatenation.
	print("total links:" + str(len(links)))

	# Append the results. newline='' writes the explicit "\r\n" terminator
	# verbatim, and the with-block closes the file even if a write fails.
	try:
		with io.open(output_path, 'a', encoding='utf-8', newline='') as fp:
			fp.writelines(_format_record(good) for good in goods)
	except (IOError, OSError):
		print("write reasult in file failed!")
	print("all is done...")

def main():
	'''Script entry point: run the second-hand market scrape once.'''
	catch_ershou()


# Run the scraper only when executed as a script, not when imported.
if "__main__" == __name__:
	main()


最後更新:2017-04-03 12:56:29

  上一篇:go regsvr32 注冊.dll的用法
  下一篇:go VM啟動報錯Cannot open the disk,Failed to lock the file