Web Scraping 1: Using the urllib and requests Libraries

1. urlopen: reading HTML

from urllib import request

url ="https://www.baidu.com"
#urlopen(url,data) url:链接 data:post模式传输数据
res = request.urlopen(url)
print(res.read()) #获取源码
print(res.getcode()) #获取网页状态
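
urlopen raises an exception on failure rather than returning an error value, so in practice it is worth wrapping the call; a minimal sketch using the stdlib urllib.error classes and a timeout:

from urllib import request, error

url = "https://www.baidu.com"
try:
    # timeout (in seconds) avoids hanging forever on a dead server
    res = request.urlopen(url, timeout=10)
    print(res.getcode())
except error.HTTPError as e:
    # the server responded with a 4xx/5xx status
    print("HTTP error:", e.code)
except error.URLError as e:
    # DNS failure, refused connection, timeout, etc.
    print("URL error:", e.reason)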

2. urlretrieve: downloading files

url ="https://www.baidu.com/"
res = request.urlretrieve(url,'a.html') #下载网页或图片
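
urlretrieve also accepts a reporthook callback that fires as blocks arrive, which is handy for showing download progress; a minimal sketch (the three-argument hook signature is fixed by the stdlib):

from urllib import request

def progress(block_num, block_size, total_size):
    # total_size is -1 when the server sends no Content-Length header
    if total_size > 0:
        percent = min(100, block_num * block_size * 100 // total_size)
        print(f"\rdownloaded {percent}%", end="")

request.urlretrieve("https://www.baidu.com/", "a.html", progress)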

3. urlencode and parse_qs: encoding and decoding query strings

from urllib import parse
data = {
    'account':"星伴同行",
    'pdw':'123'
}

datacode = parse.urlencode(data)
print(datacode)

hstr = "account=%E6%98%9F%E4%BC%B4%E5%90%8C%E8%A1%8C&pdw=123"
data = parse.parse_qs(hstr)
print(data)
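
Note that parse_qs maps every key to a list of values (here {'account': ['星伴同行'], 'pdw': ['123']}); its sibling parse_qsl returns flat (key, value) pairs instead, and quote/unquote work on single components — a quick sketch:

from urllib import parse

hstr = "account=%E6%98%9F%E4%BC%B4%E5%90%8C%E8%A1%8C&pdw=123"
print(parse.parse_qsl(hstr))   # [('account', '星伴同行'), ('pdw', '123')]
print(parse.quote("星伴同行"))   # percent-encode a single value
print(parse.unquote("%E6%98%9F%E4%BC%B4%E5%90%8C%E8%A1%8C"))  # and decode it back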

4. urlparse and urlsplit: splitting a URL

from urllib import parse
url = "https://www.baidu.com"

result = parse.urlparse(url)
result2 = parse.urlsplit(url)
print(result)
print(result2)
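
Both calls above return mostly empty fields because the URL is a bare domain. With a fuller URL the difference shows: urlparse returns six components including the rarely-used params field, while urlsplit returns five and leaves ';params' inside the path — a quick sketch:

from urllib import parse

url = "https://www.baidu.com/s;type=1?wd=python#top"
r = parse.urlparse(url)
print(r.scheme, r.netloc, r.path, r.params, r.query, r.fragment)
# https www.baidu.com /s type=1 wd=python top
r2 = parse.urlsplit(url)
print(r2.path)  # '/s;type=1' — urlsplit has no separate params field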

5. Request: adding request headers

# Some sites apply anti-scraping checks; without a User-Agent header, a bare urlopen may get back an incomplete page
from urllib import request
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
}
url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="

req = request.Request(url,headers=headers)
resp = request.urlopen(req)
print(resp.read())
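
Headers can also be attached after constructing the Request via add_header; a minimal sketch against httpbin, which echoes back the headers it receives:

from urllib import request

req = request.Request("http://httpbin.org/headers")
req.add_header('User-Agent', 'Mozilla/5.0')  # equivalent to passing headers={...}
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))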

Request: sending a POST request
from urllib import request,parse

data = {
    'first': 'true',
    'pn': '1',
    'kd': 'python'
}
headers = {
    # Content-Length does not need to be set by hand: urlopen computes it from the request body
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Cookie': 'JSESSIONID=ABAAABAABGGAAFD12BB3145F6FA95A34A8E2280E381725A; WEBTJ-ID=20200206145805-170194aa058292-0200cf6f8b6603-b383f66-1327104-170194aa059159; user_trace_token=20200206145805-713c19bf-cfdf-4c56-a74c-b1c25ca80ebc; LGUID=20200206145805-331ef79e-f9cb-4f8b-a0b5-eb1e38a449a6; _ga=GA1.2.2126362089.1580972287; _gid=GA1.2.1856246945.1580972287; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1580972287; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; LGSID=20200206154258-6621e4f6-8370-43e5-9354-8c7a421916d5; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; X_HTTP_TOKEN=1c493672f1fa0791979479085186aa5fbb5541c8e7; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221701973c369192-0792cca33d4581-b383f66-1327104-1701973c36a299%22%2C%22%24device_id%22%3A%221701973c369192-0792cca33d4581-b383f66-1327104-1701973c36a299%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1580974982; LGRID=20200206154300-886df9e1-381b-4206-8c14-540042116f5f; SEARCH_ID=48c818011d9c407ba650002bd8b11b74',
}

data = parse.urlencode(data)  # URL-encode the form dict into 'first=true&pn=1&kd=python'
url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
req = request.Request(url, headers=headers, data=data.encode('utf-8'), method='POST')
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))
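
The endpoint returns JSON, so instead of just printing the text the body can be parsed with the stdlib json module (a minimal sketch that replaces the print above; note a response body can only be read once, and the exact payload structure depends on the site):

import json

body = resp.read().decode('utf-8')  # read the body once
result = json.loads(body)           # dict parsed from the JSON text
print(result.keys())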

ProxyHandler: requesting through an IP proxy

# -*- coding: utf-8 -*-
from urllib import request

url = "http://httpbin.org/ip"
# without a proxy, httpbin echoes back your real IP:
# resp = request.urlopen(url)
# print(resp.read().decode('utf-8'))

# with a proxy (free proxies listed e.g. at Kuaidaili: https://www.kuaidaili.com/ops/)
handle = request.ProxyHandler({"http": '114.101.45.235:65309'})
opener = request.build_opener(handle)
resp = opener.open(url)
print(resp.read().decode('utf-8'))
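
If the proxy requires authentication, the credentials go directly into the proxy URL; a sketch where the address and credentials are placeholders:

from urllib import request

# hypothetical credentials and proxy address, for illustration only
handle = request.ProxyHandler({"http": "http://user:password@127.0.0.1:8888"})
opener = request.build_opener(handle)
resp = opener.open("http://httpbin.org/ip")
print(resp.read().decode("utf-8"))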

Cookies: CookieJar (kept in memory) and MozillaCookieJar (saved to a file)

# -*- coding: utf-8 -*-
from urllib import request, parse
from http.cookiejar import CookieJar

def get_opener():
    # 1.1 create a CookieJar object (cookies live in memory)
    cookiejar = CookieJar()
    # 1.2 wrap the jar in an HTTPCookieProcessor handler
    handler = request.HTTPCookieProcessor(cookiejar)
    # 1.3 build an opener from that handler
    opener = request.build_opener(handler)
    return opener

def login(opener):
    header={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }
    data = {
        'email':"512911049@qq.com",
        'password':'******'
    }
    loginUrl = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20201418252"
    data = parse.urlencode(data)
    req = request.Request(loginUrl, headers=header, data=data.encode('utf-8'))
    resp = opener.open(req)  # the opener's CookieJar stores the login cookie automatically

def visit_html(opener):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }
    PCURL = "http://www.renren.com/880151247/profile"
    req = request.Request(PCURL, headers=header)
    resp = opener.open(req)  # sends the stored login cookie along with the request
    vhtml = resp.read().decode('utf-8')
    return vhtml


opener = get_opener()
login(opener)
vhtml = visit_html(opener)
print(vhtml)
# inspecting cookies: collect name/value pairs from the jar
# (note: cookiejar is local to get_opener here — see the sketch below)
# cookies = []
# for cookitem in cookiejar:
#     cookies.append({
#         "key": cookitem.name,
#         "value": cookitem.value
#     })
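
The commented block above assumes the CookieJar is still in scope, but in get_opener it is a local variable; one fix is to return the jar alongside the opener — a sketch reusing the login function from above:

def get_opener_and_jar():
    # same as get_opener, but also hands the jar back for inspection
    cookiejar = CookieJar()
    handler = request.HTTPCookieProcessor(cookiejar)
    return request.build_opener(handler), cookiejar

opener, cookiejar = get_opener_and_jar()
login(opener)
cookies = [{"key": c.name, "value": c.value} for c in cookiejar]
print(cookies)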

MozillaCookieJar: saving cookies to a file

# -*- coding: utf-8 -*-
from urllib import request
from http.cookiejar import MozillaCookieJar

# 1. create the cookiejar, bound to a file on disk
cookiejar = MozillaCookieJar('cookie.txt')
# 2. create the handler and build the opener
handle = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handle)

resp = opener.open("http://www.baidu.com")

# ignore_discard=True also writes session cookies that would otherwise be discarded
cookiejar.save(ignore_discard=True)

# load the cookies back from the file
cookiejar.load('cookie.txt')
for cookitem in cookiejar:
    print("key: " + cookitem.name + "  value: " + cookitem.value)

requests library: GET requests

# -*- coding: utf-8 -*-
import requests

kw = {'wd':'中国'}
header = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    'Host':'www.baidu.com',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}
response = requests.get("https://www.baidu.com/s",params=kw,headers=header)

# response.text returns the body decoded to str (using requests' guessed encoding)
# print(response.text)
# response.content returns raw bytes, so decode explicitly with utf-8
# print(response.content.decode('utf-8'))
# the final URL, with params encoded into the query string
print(response.url)
# the character encoding inferred from the response headers
print(response.encoding)
# the HTTP status code
print(response.status_code)
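
As with urlopen, real-world requests usually set a timeout and check the status code; requests provides raise_for_status for the latter — a minimal sketch:

import requests

resp = requests.get(
    "https://www.baidu.com/s",
    params={'wd': '中国'},
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=10,            # seconds; avoids hanging on a dead server
)
resp.raise_for_status()    # raises requests.HTTPError on a 4xx/5xx response
print(resp.status_code)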

requests library: POST requests

# -*- coding: utf-8 -*-
import requests, json

data = {
    'first': 'true',
    'pn': '1',
    'kd': 'python'
}

url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Cookie': 'JSESSIONID=ABAAABAABGGAAFD12BB3145F6FA95A34A8E2280E381725A; WEBTJ-ID=20200206145805-170194aa058292-0200cf6f8b6603-b383f66-1327104-170194aa059159; user_trace_token=20200206145805-713c19bf-cfdf-4c56-a74c-b1c25ca80ebc; LGUID=20200206145805-331ef79e-f9cb-4f8b-a0b5-eb1e38a449a6; _ga=GA1.2.2126362089.1580972287; _gid=GA1.2.1856246945.1580972287; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1580972287; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; LGSID=20200206154258-6621e4f6-8370-43e5-9354-8c7a421916d5; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; X_HTTP_TOKEN=1c493672f1fa0791979479085186aa5fbb5541c8e7; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221701973c369192-0792cca33d4581-b383f66-1327104-1701973c36a299%22%2C%22%24device_id%22%3A%221701973c369192-0792cca33d4581-b383f66-1327104-1701973c36a299%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1580974982; LGRID=20200206154300-886df9e1-381b-4206-8c14-540042116f5f; SEARCH_ID=48c818011d9c407ba650002bd8b11b74',
}

response = requests.post(url, data=data, headers=headers)
# parse the JSON response, method 1: decode the bytes yourself and use the json module
result = response.content.decode("utf-8")
jsonResult = json.loads(result)
print(jsonResult)
# method 2: let requests parse it directly
result2 = response.json()
print(result2)
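
When an API expects a JSON body rather than form fields, requests can serialize it directly through the json= parameter (a sketch against httpbin, which echoes the request back):

import requests

resp = requests.post("http://httpbin.org/post", json={'kd': 'python'})
print(resp.json()['json'])  # httpbin echoes the parsed JSON body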

requests: IP proxies

# -*- coding: utf-8 -*-
import requests

proxy = '115.216.79.93:9999'
# a proxy that requires authentication:
# proxy = 'username:password@127.0.0.1:8888'

proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}
try:
    response = requests.get('http://httpbin.org/ip', proxies=proxies)
    print(response.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)
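
Proxies can also be set once on a Session so that every request made through it is routed the same way (a minimal sketch reusing the proxy address above):

import requests

session = requests.Session()
# every request made through this session now goes via the proxy
session.proxies.update({'http': 'http://115.216.79.93:9999'})
resp = session.get('http://httpbin.org/ip')
print(resp.text)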

requests: cookies and sessions

# -*- coding: utf-8 -*-
import requests

# url = "https://www.baidu.com"
#
# resp = requests.get(url)
# print(resp.cookies)
# print(resp.cookies.get_dict())

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
}
data = {
    'email': "512911049@qq.com",
    'password': '*******'
}
loginUrl = "http://www.renren.com/PLogin.do"
# a Session keeps cookies across requests, so the login below persists
session = requests.Session()
session.post(loginUrl, data=data, headers=header)

resp = session.get("http://www.renren.com/880151247/profile")  # sent with the login cookie
print(resp.text)
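
For a one-off request, a plain dict can be passed via the cookies parameter instead of keeping a Session (a sketch; the cookie name and value are made-up placeholders):

import requests

# hypothetical cookie for illustration
resp = requests.get("http://httpbin.org/cookies", cookies={'token': 'abc123'})
print(resp.text)  # httpbin echoes the cookies it received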

requests: handling untrusted certificates

import requests

# verify=False skips SSL certificate verification; only use it for targets you trust
resp = requests.get("https://www.baidu.com", verify=False)
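
With verify=False, urllib3 emits an InsecureRequestWarning on every request; it can be silenced explicitly — a minimal sketch:

import requests
import urllib3

# suppress the warning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
resp = requests.get("https://www.baidu.com", verify=False)
print(resp.status_code)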