1、urlopen读取html
from urllib import request
# Fetch a page with urlopen and show its body and HTTP status.
url = "https://www.baidu.com"
# urlopen(url, data): `url` is the address to fetch; when `data` is given it
# is sent as the body of a POST request.
# The context manager closes the connection as soon as the block exits.
with request.urlopen(url) as res:
    print(res.read())  # page source, as raw bytes
    print(res.getcode())  # HTTP status code of the response
2、urlretrieve下载文件
# Download a resource (a web page or an image) straight into a local file.
url = "https://www.baidu.com/"
res = request.urlretrieve(url, filename='a.html')
3、urlencode、parse_qs:URL 查询参数的编码与解码
from urllib import parse
# Encode a mapping into a URL query string (non-ASCII text is percent-encoded).
data = dict(account="星伴同行", pdw='123')
datacode = parse.urlencode(data)
print(datacode)

# Decode a query string back into a dict; each key maps to a list of values.
hstr = "account=%E6%98%9F%E4%BC%B4%E5%90%8C%E8%A1%8C&pdw=123"
data = parse.parse_qs(hstr)
print(data)
4、urlparse、urlsplit 分割url
from urllib import parse
# Split one URL with both helpers and print the two results for comparison.
url = "https://www.baidu.com"
result, result2 = parse.urlparse(url), parse.urlsplit(url)
for parts in (result, result2):
    print(parts)
5、request.Request:为请求添加请求头(headers)
# Some sites apply anti-crawler checks: a plain urlopen() without browser-like
# headers gets an incomplete page back, so send a User-Agent explicitly.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
}
url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
req = request.Request(url, headers=headers)
# Close the connection deterministically once the body has been read.
with request.urlopen(req) as resp:
    print(resp.read())
#post 请求数据
POST 接口地址:https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false
使用 request.Request 发送 POST 请求
from urllib import request,parse
# Form fields sent in the body of the job-search POST request.
data = {
    'first': 'true',
    'pn': '1',
    'kd': 'python',
}
# Content-Length is deliberately not set here. len(data) on the dict is the
# number of fields (3), not the byte length of the encoded body, and supplying
# it stops urllib from filling in the correct value for the bytes payload.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Cookie': 'JSESSIONID=ABAAABAABGGAAFD12BB3145F6FA95A34A8E2280E381725A; WEBTJ-ID=20200206145805-170194aa058292-0200cf6f8b6603-b383f66-1327104-170194aa059159; user_trace_token=20200206145805-713c19bf-cfdf-4c56-a74c-b1c25ca80ebc; LGUID=20200206145805-331ef79e-f9cb-4f8b-a0b5-eb1e38a449a6; _ga=GA1.2.2126362089.1580972287; _gid=GA1.2.1856246945.1580972287; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1580972287; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; LGSID=20200206154258-6621e4f6-8370-43e5-9354-8c7a421916d5; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; X_HTTP_TOKEN=1c493672f1fa0791979479085186aa5fbb5541c8e7; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221701973c369192-0792cca33d4581-b383f66-1327104-1701973c36a299%22%2C%22%24device_id%22%3A%221701973c369192-0792cca33d4581-b383f66-1327104-1701973c36a299%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1580974982; LGRID=20200206154300-886df9e1-381b-4206-8c14-540042116f5f; SEARCH_ID=48c818011d9c407ba650002bd8b11b74',
}
data = parse.urlencode(data)  # form-encode the fields into a query string
url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
req = request.Request(url, headers=headers, data=bytes(data, encoding='utf-8'), method='POST')
# Close the connection deterministically once the body has been read.
with request.urlopen(req) as resp:
    print(resp.read().decode('utf-8'))
ProxyHandler IP代理
# encode:utf-8
from urllib import request,parse
# Endpoint used to compare the result without and with a proxy.
url = "http://httpbin.org/ip"
# Without a proxy the request would simply be:
#     req = request.urlopen(url)
#     print(req.read().decode('utf-8'))
print(1)
# With a proxy (proxy list: https://www.kuaidaili.com/ops/).
handle = request.ProxyHandler({"http": '114.101.45.235:65309'})
opener = request.build_opener(handle)
# Close the proxied connection as soon as the body has been read.
with opener.open(url) as resq:
    print(resq.read().decode('utf-8'))
print(2)
Cookie模块 CookieJar[保存内存]模块与MozillaCookieJar[保存文件]
# encode:utf-8
from urllib import request,parse
from http.cookiejar import CookieJar
def get_opener(cookiejar=None):
    """Build a urllib opener that keeps cookies between requests.

    Args:
        cookiejar: Optional ``CookieJar`` to store the cookies in. Passing one
            lets the caller inspect the cookies afterwards. When omitted, a
            new in-memory ``CookieJar`` is created.

    Returns:
        An ``OpenerDirector`` built around an ``HTTPCookieProcessor`` that
        uses the jar.
    """
    # Compare against None explicitly: an empty CookieJar is falsy because it
    # defines __len__, so `cookiejar or CookieJar()` would discard it.
    if cookiejar is None:
        cookiejar = CookieJar()
    handler = request.HTTPCookieProcessor(cookiejar)
    return request.build_opener(handler)
def login(opener):
    """Post the hard-coded renren.com credentials through *opener*.

    The response body is not used; the request is made for its side effects
    (presumably so that a cookie-aware opener records the login cookies --
    this depends on how the caller built *opener*).

    Args:
        opener: Object whose ``open(request)`` performs the HTTP request,
            e.g. the opener returned by ``get_opener()``.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }
    data = {
        'email': "512911049@qq.com",
        'password': '******',
    }
    loginUrl = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20201418252"
    # Supplying a body makes urllib issue a POST.
    body = parse.urlencode(data).encode('utf-8')
    req = request.Request(loginUrl, headers=header, data=body)
    # Release the connection right away instead of leaving the response open.
    opener.open(req).close()
def vist_html(opener):
    """Fetch the renren.com profile page through *opener*.

    Args:
        opener: Object whose ``open(request)`` performs the HTTP request,
            e.g. the opener returned by ``get_opener()``.

    Returns:
        The response body decoded as UTF-8.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }
    PCURL = "http://www.renren.com/880151247/profile"
    req = request.Request(PCURL, headers=header)
    # Close the connection once the body has been read.
    with opener.open(req) as resq:
        return resq.read().decode('utf-8')
# Order matters: log in first, then fetch the profile page with the SAME
# opener, whose HTTPCookieProcessor carries the cookies between the two calls.
opener = get_opener()
login(opener)
vhtml = vist_html(opener)
print(vhtml)
# Sketch (disabled): collect the stored cookies as a list of dicts.
# NOTE(review): `cookiejar` is not defined at this level (it is local to
# get_opener), and the name `list` would shadow the builtin.
# list = []
# for cookitem in cookiejar:
#     itemcook = {
#         "key":cookitem.name,
#         "Value":cookitem.value
#     }
#     list.append(itemcook)
MozillaCookieJar 将cookie保存到文件中
# encode:utf-8
from urllib import request
from http.cookiejar import MozillaCookieJar
# 1. Create a cookie jar backed by the file cookie.txt.
cookiejar = MozillaCookieJar('cookie.txt')
# 2. Build an opener that records response cookies into that jar.
handle = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handle)
resq = opener.open("http://www.baidu.com")
# The request is made only for its cookies; release the connection.
resq.close()
# 3. Persist the jar so that the load() below has a file to read.
# ignore_discard=True also writes session cookies (those flagged to be
# discarded at the end of the session), which save() skips by default.
cookiejar.save('cookie.txt', ignore_discard=True)
# 4. Load the cookies back from the file and list them.
cookiejar.load('cookie.txt', ignore_discard=True)
for cookitem in cookiejar:
    print("key:"+cookitem.name+"value:"+cookitem.value)
requests库学习:GET请求
# encode:utf-8
import requests
# Query-string parameters, passed to requests.get() through `params`.
kw = {'wd': '中国'}
header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    'Host': 'www.baidu.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}
response = requests.get("https://www.baidu.com/s", headers=header, params=kw)
# response.text is the body as str; response.content is the raw bytes, hence
# response.content.decode('utf-8') when decoding by hand.
# Print, in order: the full request URL, the response encoding, the status code.
for value in (response.url, response.encoding, response.status_code):
    print(value)
requests库学习:POST请求
# encode:utf-8
import requests,json
# Form fields sent in the body of the POST request.
data = dict(first='true', pn='1', kd='python')
url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Cookie': 'JSESSIONID=ABAAABAABGGAAFD12BB3145F6FA95A34A8E2280E381725A; WEBTJ-ID=20200206145805-170194aa058292-0200cf6f8b6603-b383f66-1327104-170194aa059159; user_trace_token=20200206145805-713c19bf-cfdf-4c56-a74c-b1c25ca80ebc; LGUID=20200206145805-331ef79e-f9cb-4f8b-a0b5-eb1e38a449a6; _ga=GA1.2.2126362089.1580972287; _gid=GA1.2.1856246945.1580972287; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1580972287; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; LGSID=20200206154258-6621e4f6-8370-43e5-9354-8c7a421916d5; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; X_HTTP_TOKEN=1c493672f1fa0791979479085186aa5fbb5541c8e7; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221701973c369192-0792cca33d4581-b383f66-1327104-1701973c36a299%22%2C%22%24device_id%22%3A%221701973c369192-0792cca33d4581-b383f66-1327104-1701973c36a299%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1580974982; LGRID=20200206154300-886df9e1-381b-4206-8c14-540042116f5f; SEARCH_ID=48c818011d9c407ba650002bd8b11b74',
}
response = requests.post(url, headers=headers, data=data)

# Way 1: decode the raw bytes and parse the JSON text by hand.
result = response.content.decode("utf-8")
jsonResutl = json.loads(result)
print(jsonResutl)

# Way 2: let the response object parse the JSON itself.
result2 = response.json()
print(result2)
requests:IP代理
# encode:utf-8
import requests,json
# Address (host:port) of the proxy to route the request through.
proxy = '115.216.79.93:9999'
# For a proxy that requires authentication:
# proxy = 'username:password@127.0.0.1:8888'
proxies = {
    'http': 'http://' + proxy,
    # NOTE(review): the 'https://' scheme asks for a TLS connection to the
    # proxy itself; if this proxy only speaks plain HTTP, 'http://' + proxy
    # may be needed here as well -- confirm against the proxy in use.
    'https': 'https://' + proxy,
}
# Keep the try body to the one call that can fail with a connection error.
try:
    response = requests.get('http://httpbin.org/ip', proxies=proxies)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)
else:
    print(response.text)
requests:cookies(使用 session 保持登录状态)
# encode:utf-8
import requests,json
# Cookies of a single response can be inspected like this:
#     resp = requests.get("https://www.baidu.com")
#     print(resp.cookies)
#     print(resp.cookies.get_dict())
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
}
data = dict(email="512911049@qq.com", password='*******')
loginUrl = "http://www.renren.com/PLogin.do"

# One session object is used for both calls: the login POST first, then the
# profile GET.
session = requests.session()
session.post(loginUrl, headers=header, data=data)
resp = session.get("http://www.renren.com/880151247/profile")
print(resp.text)
requests:处理不信任证书
# verify=False switches off TLS certificate verification so that hosts with
# an untrusted certificate can still be reached. The server's identity is then
# not checked, so avoid this outside of testing.
resp = requests.get("https://www.baidu.com",verify=False)