13. 网络爬虫案例实战1

  • 本次案例是通过登录人人网,抓取登录后用户中心的信息

1. 模拟人人登录请求,执行登录验证操作

from urllib import request,parse

# Log in to renren.com by POSTing the login form to the AJAX login endpoint
# and print the server's reply.
# NOTE(review): the account, password hash and rkey below are hard-coded
# sample values (presumably captured from a browser session) -- replace them
# with your own and never commit real credentials.
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018321648829'
data = {
    'email':'1352*****6',
    'icode':'',
    'origURL':'http://www.renren.com/home',
    'domain':'renren.com',
    'key_id':'1',
    'captcha_type':'web_login',
    'password':'478b7c2dca554eeabed3b7374703bff4a6a22e78b8a9fcfb090e3a7fb792992b',
    'rkey':'e954ec64a7ecf4e33bdf81bb1abad158',
    'f':'http%3A%2F%2Fwww.renren.com%2F965541786',
}
data = parse.urlencode(data)
# Encode once: the same byte string is sent as the body and measured for
# Content-Length, so the header always matches the bytes on the wire.
body = data.encode('utf-8')

headers = {
    'Content-Length': str(len(body)),
}
req = request.Request(url=login_url, data=body, headers=headers)
# Use the response as a context manager so the connection is closed as soon
# as the body has been read, even if decoding fails.
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

2. 抓取登录成功后的用户home页信息

from urllib import request
import re,gzip

# Fetch the logged-in user's home page by replaying a browser request
# (including a Cookie header) and print the page <title>.
# NOTE(review): the Cookie value is a hard-coded sample, presumably copied
# from a logged-in browser session -- it has to be replaced with a current one.
base_url = 'http://www.renren.com/965541786'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'anonymid=jgdcqcjgqy4yxt; depovince=BJ; _r01_=1; JSESSIONID=abc7HUv9M_HsB7WkgK2lw; ick_login=b954cf62-bbe5-480d-b679-e1e3ce584896; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; ick=b4337770-b7ce-4a70-b9d0-cd63c7fc7bb5; XNESSESSIONID=738f4bde312f; jebe_key=f950add1-40e8-4009-a157-bfc3d89f7350%7C24a48cb369f8637c5ee2c4a23eb5b93f%7C1524555370510%7C1%7C1524555375485; first_login_flag=1; ln_uact=13520319616; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; wp_fold=0; jebecookies=ef7f7372-0e70-45db-aaae-c415d4611918|||||; _de=8C2F648D7158ED727318288C8F3F21C5; p=f1ea4b6984cefb7d88164a67816c91fe6; t=401516286d37bde6735180d25f68f2fe6; societyguester=401516286d37bde6735180d25f68f2fe6; id=965541786; xnsid=928c27b; ver=7.0; loginfrom=null',
    'Host': 'www.renren.com',
    'Referer': 'http://www.renren.com/SysHome.do',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}

req = request.Request(url=base_url,headers=headers)

# Because the request advertises gzip in Accept-Encoding, the body may come
# back compressed. Decoding compressed bytes directly fails with:
#   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
# (Alternatively, remove gzip from Accept-Encoding so the server replies
# uncompressed.)
# Decompress only when the server reports gzip, so an uncompressed reply no
# longer crashes inside gzip.decompress().
# NOTE(review): a deflate-encoded reply is not handled here.
with request.urlopen(req) as res:
    raw = res.read()
    if res.headers.get('Content-Encoding', '').lower() == 'gzip':
        raw = gzip.decompress(raw)

html = raw.decode("utf-8")
# print(html)  # uncomment to dump the whole page
print(re.findall("<title>(.*?)</title>",html))

3. 使用cookiejar将上面两个合并到一起执行

from urllib import request,parse
import re,gzip,time

# Cookie management module
from http import cookiejar
# Cookie jar object that stores the cookies
cookie = cookiejar.CookieJar()
# Cookie handler built on top of the jar above
cookie_handler = request.HTTPCookieProcessor(cookie)
# Opener that sends requests through the cookie handler; doLogin() and
# myHome() below both go through it
opener = request.build_opener(cookie_handler)

def doLogin():
    """POST the login form to renren.com through the module-level ``opener``.

    The request goes through ``opener`` (built with an HTTPCookieProcessor),
    so the same opener can be reused for follow-up requests. The response
    body is not inspected; the function returns None.

    NOTE(review): the credentials below are hard-coded sample values --
    replace them with your own and never commit real ones.
    """
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018321648829'
    data = {
        'email':'1352*****16',
        'icode':'',
        'origURL':'http://www.renren.com/home',
        'domain':'renren.com',
        'key_id':'1',
        'captcha_type':'web_login',
        'password':'478b7c2dca554eeabed3b7374703bff4a6a22e78b8a9fcfb090e3a7fb792992b',
        'rkey':'e954ec64a7ecf4e33bdf81bb1abad158',
        'f':'http%3A%2F%2Fwww.renren.com%2F965541786',
    }
    # Encode once so Content-Length is measured on the exact bytes sent.
    body = parse.urlencode(data).encode('utf-8')

    headers = {
        'Content-Length': str(len(body)),
    }
    req = request.Request(url=login_url, data=body, headers=headers)
    # The reply itself is unused; open it in a with-block only so the
    # connection is closed instead of being left dangling.
    with opener.open(req):
        pass

def myHome():
    """Fetch the user's home page through ``opener`` and print its <title>.

    The page is requested via the module-level ``opener``, decoded as UTF-8,
    and the list of ``<title>`` matches is printed. Returns None.
    """
    home_url = 'http://www.renren.com/965541786'
    # Close the response as soon as the body has been read.
    with opener.open(home_url) as res:
        html = res.read().decode("utf-8")
    # print(html)  # uncomment to dump the whole page
    print(re.findall("<title>(.*?)</title>",html))

if __name__ == '__main__':
    # Log in
    print("正在登录中...")
    doLogin()
    # Pause for three seconds before the next request
    time.sleep(3)

    # Visit the personal home page
    myHome()

4. 使用requests重写第三步的代码,实现人人网登录并抓取登录后信息

import requests
import re,time

# Session object shared by doLogin() (POST) and myHome() (GET) below
s = requests.Session()

def doLogin():
    """Submit the renren.com login form via the module-level session ``s``.

    The response of the POST is discarded; the function returns None.
    """
    # Form fields sent as the POST body.
    form_fields = {
        'email':'1352*****6',
        'icode':'',
        'origURL':'http://www.renren.com/home',
        'domain':'renren.com',
        'key_id':'1',
        'captcha_type':'web_login',
        'password':'478b7c2dca554eeabed3b7374703bff4a6a22e78b8a9fcfb090e3a7fb792992b',
        'rkey':'e954ec64a7ecf4e33bdf81bb1abad158',
        'f':'http%3A%2F%2Fwww.renren.com%2F965541786',
    }
    endpoint = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018321648829'

    s.post(endpoint, data=form_fields)

def myHome():
    """GET the user's home page via session ``s`` and print its <title> matches."""
    page = s.get('http://www.renren.com/965541786')
    # Decode the raw response bytes explicitly as UTF-8.
    markup = page.content.decode("utf-8")
    titles = re.findall("<title>(.*?)</title>", markup)
    print(titles)

if __name__ == '__main__':
    # Log in
    print("正在登录中...")
    doLogin()
    # Pause for three seconds before the next request
    time.sleep(3)

    # Visit the personal home page
    myHome()