新浪微博关键词结果爬取 实现过程 记录
借鉴 https://blog.csdn.net/u010454729/article/details/51137153
把代码
import base64
import binascii
import cookielib
import hashlib
import json
import re
import sys
import urllib
import urllib2

import rsa
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so the module is reloaded to get it back, and UTF-8 is
# then made the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding("utf-8")
# Hex-encoded RSA modulus; get_pwd() combines it with the exponent 65537 to
# build the public key used to encrypt the password.
pubkey = 'EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443'
# Install a global opener backed by a cookie jar, so that every later
# urllib2.urlopen() call in this module sends and stores cookies through it.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
# Form fields POSTed to the Sina SSO login endpoint.  The empty values of
# 'su', 'sp', 'servertime', 'nonce' and 'rsakv' are placeholders that login()
# fills in before the request is sent.
postdata = {
    'entry': 'weibo',
    'gateway': '1',
    'from': '',
    'savestate': '7',
    'userticket': '1',
    'ssosimplelogin': '1',
    'vsnf': '1',
    # ('vsnval' was commented out in the original form and is not sent.)
    'su': '',
    'service': 'miniblog',
    'servertime': '',
    'nonce': '',
    # 'rsa2' matches the RSA encryption done by get_pwd(); the older SHA-1
    # based scheme required 'wsse' here instead.
    'pwencode': 'rsa2',
    'sp': '',
    'pagerefer': 'https://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F',
    # This key was misspelled 'raskv', which sent a stray empty field next to
    # the real 'rsakv' value that login() stores.
    'rsakv': '',
    'sr': '1440*900',
    'prelt': '94',
    'encoding': 'UTF-8',
    'url': 'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
    'returntype': 'META',
}
def get_servertime():
    """Fetch the one-time login parameters from the Sina SSO prelogin URL.

    The request names ``sinaSSOController.preloginCallBack`` as its callback,
    so the response is expected to be a JSONP call whose single argument is a
    JSON object; the text between the outermost parentheses is parsed.

    Returns:
        A ``(servertime, nonce, rsakv)`` tuple, with ``servertime`` converted
        to ``str``, or ``None`` when the response cannot be parsed (an error
        line is printed in that case).

    Errors raised by ``urllib2.urlopen`` itself are not caught here.
    """
    url = 'https://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939'
    data = urllib2.urlopen(url).read()
    # The pasted pattern had lost its escaped parentheses and contained the
    # character U+2217 instead of '*', so it could never match the response.
    p = re.compile(r'\((.*)\)')
    try:
        json_data = p.search(data).group(1)
        data = json.loads(json_data)
        servertime = str(data['servertime'])
        nonce = data['nonce']
        rsakv = data['rsakv']
    except (AttributeError, ValueError, KeyError, TypeError):
        # AttributeError: no parenthesised payload was found;
        # ValueError: the payload is not valid JSON;
        # KeyError / TypeError: the JSON lacks the expected fields.
        print('Get severtime error!')
        return None
    return servertime, nonce, rsakv
def get_pwd(pwd, servertime, nonce):
    """Encrypt the password for the 'rsa2' pwencode mode and return it as hex.

    The plaintext is servertime, a tab, nonce, a newline and the password,
    each converted with ``str``; this layout was taken from the site's
    JavaScript encryption file.  The ciphertext is printed before it is
    returned.

    An earlier scheme (pwencode 'wsse') applied SHA-1 three times instead:
    sha1(sha1(sha1(pwd)) + servertime + nonce), using hex digests throughout.
    """
    # Build the public key from the module-level hex modulus.
    modulus = int(pubkey, 16)
    public_key = rsa.PublicKey(modulus, 65537)
    # str() is applied to every part so the joined plaintext stays a plain
    # byte string even when a part arrives as unicode.
    plaintext = ''.join([str(servertime), '\t', str(nonce), '\n', str(pwd)])
    ciphertext = rsa.encrypt(plaintext, public_key)
    # Convert the raw ciphertext bytes to their hexadecimal representation.
    passwd = binascii.b2a_hex(ciphertext)
    print(passwd)
    return passwd
def get_user(username):
    """Return the login name URL-quoted and then Base64-encoded.

    Args:
        username: The account name (e-mail address) as a string.

    Returns:
        The Base64 text of the URL-quoted name, as a ``str`` and without any
        line breaks.
    """
    # Imported locally so this helper works on Python 2 and Python 3 and does
    # not depend on a module-level ``import urllib``.
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3
    quoted = quote(username)
    if not isinstance(quoted, bytes):
        # Python 3: quote() returns text, but b64encode() needs bytes.
        quoted = quoted.encode('ascii')
    # b64encode() never inserts line breaks.  The previous
    # encodestring(...)[:-1] only removed the final newline, so long names
    # kept the newline emitted after every 76 output characters.
    encoded = base64.b64encode(quoted)
    if isinstance(encoded, str):
        return encoded
    return encoded.decode('ascii')
def login(username='你的登录邮箱', pwd='你的密码', query="深圳禁摩限电", page_num=1):
    """Log in to Sina Weibo, then download one page of keyword search results.

    Steps: fetch the prelogin parameters, fill in the shared ``postdata``
    form, POST it to the SSO endpoint, follow the ``location.replace(...)``
    redirect found in the response, and finally request the search page.

    Args:
        username: Login e-mail address (placeholder by default).
        pwd: Account password (placeholder by default).
        query: Search keyword placed into the search URL.
        page_num: Result page number placed into the search URL.

    Side effects:
        Updates the module-level ``postdata`` dict in place, prints progress
        and the submitted fields, writes the redirect response to
        ``fres.txt`` and the search page to ``shenzhenlimitcar_sf.html``.

    Returns:
        None.  Failures are reported by printing 'Login error!'.
    """
    url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    try:
        prelogin = get_servertime()
    except Exception as e:
        # Previously a bare ``except: return`` hid the reason for the failure.
        print('Login error!')
        print(e)
        return
    if prelogin is None:
        # get_servertime() has already printed its own error line.
        return
    servertime, nonce, rsakv = prelogin

    postdata['servertime'] = servertime
    postdata['nonce'] = nonce
    postdata['rsakv'] = rsakv
    postdata['su'] = get_user(username)
    postdata['sp'] = get_pwd(pwd, servertime, nonce)
    # Show which fields are about to be submitted.
    for key in postdata:
        print('%s : %s' % (key, postdata[key]))

    # Encode into a local name.  Rebinding the global ``postdata`` to the
    # encoded string made any second call to login() fail.
    body = urllib.urlencode(postdata)
    headers = {'User-Agent':'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0'}
    req = urllib2.Request(url=url, data=body, headers=headers)
    text = urllib2.urlopen(req).read()

    # The pasted pattern had lost its escaped parentheses and contained
    # U+2217 instead of '*'.  Either quote style is accepted around the URL.
    redirect = re.search(r'location\.replace\([\'"](.*?)[\'"]\)', text)
    if redirect is None:
        print('Login error!')
        return
    login_url = redirect.group(1)
    print(login_url)
    print(30 * "*")
    try:
        result1 = urllib2.urlopen(login_url).read()
        with open("fres.txt", "w") as fres:
            fres.write(result1)
        # Reaching this point only means the redirect was fetched; whether
        # the login really succeeded depends on what result1 contains.
        print(u"登录成功!")
        url = "https://s.weibo.com/weibo/{0}&b=1&nodup=1&page={1}".format(query, page_num)
        print(url.encode("gbk"))
        html = urllib2.urlopen(urllib2.Request(url)).read()
        with open("shenzhenlimitcar_sf.html", "w") as f_query1:
            f_query1.write(html)
    except Exception as e:
        print('Login error!')
        print(e)
# Run the login and search flow only when the file is executed as a script.
if __name__ == "__main__":
    login()
放进sublime,用python3运行,发现包的兼容问题有点多,猜测原作者是在python2.7环境下运行的,切换成2.7,再次**cmd+b**
第一个报错:
SyntaxError: Non-ASCII character '\xe2' in file /Users/Lily/储存/xinlang_py27.py on line 41, but no encoding declared;
see https://python.org/dev/peps/pep-0263/ for details
解决方法:
在文件头追加:
# -*- coding: utf-8 -*-
第二个报错:
rsakv=data['rsakv']
^
SyntaxError: invalid syntax
解决方法:
缩进问题,前面加4个空格(1个tab);
第三个报错:
print i,":", postdata[i]#可以看提交的哪些数据
^
IndentationError: expected an indented block
解决方法:
也是缩进问题;
第四个报错:
import rsa
ImportError: No module named rsa
解决方法:
到终端,
pip install rsa
报错:
Exception:
Traceback (most recent call last):
File "/Library/Python/2.7/site-packages/pip/basecommand.py", line 215, in main
status = self.run(options, args)
File "/Library/Python/2.7/site-packages/pip/commands/install.py", line 342, in run
prefix=options.prefix_path,
File "/Library/Python/2.7/site-packages/pip/req/req_set.py", line 784, in install
**kwargs
File "/Library/Python/2.7/site-packages/pip/req/req_install.py", line 851, in install
self.move_wheel_files(self.source_dir, root=root, prefix=prefix)
File "/Library/Python/2.7/site-packages/pip/req/req_install.py", line 1064, in move_wheel_files
isolated=self.isolated,
File "/Library/Python/2.7/site-packages/pip/wheel.py", line 345, in move_wheel_files
clobber(source, lib_dir, True)
File "/Library/Python/2.7/site-packages/pip/wheel.py", line 316, in clobber
ensure_dir(destdir)
File "/Library/Python/2.7/site-packages/pip/utils/__init__.py", line 83, in ensure_dir
os.makedirs(path)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/os.py", line 157, in makedirs
mkdir(name, mode)
OSError: [Errno 13] Permission denied: '/Library/Python/2.7/site-packages/pyasn1'
解决方法:
sudo pip install rsa
在shell里显示
Collecting rsa
Downloading rsa-3.4.2-py2.py3-none-any.whl (46kB)
100% |████████████████████████████████| 51kB 126kB/s
Collecting pyasn1>=0.1.3 (from rsa)
Downloading pyasn1-0.3.7-py2.py3-none-any.whl (63kB)
100% |████████████████████████████████| 71kB 32kB/s
Installing collected packages: pyasn1, rsa
Successfully installed pyasn1-0.3.7 rsa-3.4.2
但运行sublime还是显示没有rsa包;
在shell里分别
pip list
和
pip3 list
发现果然rsa被装到python3上了;
解决方法1:
找到一个名为 rsa-3.1.1-py2.7.egg 的文件,感觉可以把rsa包直接安装到python2.7,但没找到方法安装此egg文件,方法1暂停尝试;
解决方法2:
201710062219
最后更新:2017-10-07 18:33:08