from urllib import request,error
import re,time,json
def getPage(url, timeout=10):
    """Fetch the page at ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait on blocking network operations before
            giving up. Without a timeout a stalled server would hang the
            caller indefinitely.

    Returns:
        The response body as ``str`` when the response code is 200;
        ``None`` for any other code or when the request fails.
    """
    # Send a browser-style User-Agent header with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    try:
        # The context manager closes the response even if reading or
        # decoding raises.
        with request.urlopen(req, timeout=timeout) as res:
            if res.code != 200:
                return None
            return res.read().decode("utf-8")
    except OSError:
        # URLError and HTTPError are OSError subclasses; catching the base
        # class also covers socket timeouts and resets raised while reading.
        return None
def parsePage(html):
    """Extract the fields matched by the entry pattern from page HTML.

    Args:
        html: Page source as a string. ``None`` or an empty string is
            treated as a page with no entries.

    Returns:
        A list of 4-tuples, one per match in document order:
        the ``href`` of the link inside ``<p class="name">``, the link
        text, the whitespace-trimmed content of ``<p class="star">``, and
        the content of ``<p class="releasetime">``. The list is also
        printed.
    """
    # Raw string so that ``\s`` reaches the regex engine without relying on
    # Python passing an unrecognised escape sequence through.
    pat = r'<p class="name"><a.*?href="(.*?)"[^<>]*>(.*?)</a></p>\s+<p class="star">\s+(.*?)\s+</p>\s+<p class="releasetime">(.*?)</p>'
    # re.S lets ``.`` span newlines, since one entry covers several lines.
    items = re.findall(pat, html, re.S) if html else []
    print(items)
    return items
# Fetch the board page and print the entries parsed from it.
url = "https://maoyan.com/board/4"
html = getPage(url)
if html is None:
    # getPage returns None on any fetch failure; report it instead of
    # handing None to the regex parser.
    print("Failed to fetch page:", url)
else:
    parsePage(html)