1、谷歌浏览器安装xpath-helper
谷歌浏览器如何扩展插件:https://jingyan.baidu.com/article/ff41162501baca12e482370a.html
2、lxml解析
# -*- coding: utf-8 -*-
from lxml import etree

# 1. Parsing an HTML string: etree.HTML() builds an element tree from the
#    text, and etree.tostring() serializes it back to bytes, which are then
#    decoded to a str.
text = "<div>111</div>"
htmlElement = etree.HTML(text)
str1 = etree.tostring(htmlElement, encoding='utf-8').decode('utf-8')

# 2. Parsing an HTML file.
#    etree.parse() uses the XML parser by default, which raises an error on
#    HTML that is not well-formed. An HTMLParser must therefore be created
#    AND passed to parse() via the `parser` argument.
parser = etree.HTMLParser(encoding='utf-8')
htmlElement2 = etree.parse("test.html", parser=parser)
# Serialize the tree parsed from the file (not the string-based tree above).
str2 = etree.tostring(htmlElement2, encoding='utf-8').decode('utf-8')
3、xpath抓取豆瓣TOP250电影
# -*- coding: utf-8 -*-
import requests
from lxml import etree

# ----------------------------------------------------------------------
# 1. Fetch the HTML page with requests
# ----------------------------------------------------------------------
url = "https://movie.douban.com/top250"
# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'
}
# requests has no default timeout; set one so the script cannot hang forever.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
text = response.text

# ----------------------------------------------------------------------
# 2. Extract the movie entries with XPath
# ----------------------------------------------------------------------
html = etree.HTML(text)
# Each <li> under <ol class="grid_view"> is treated as one movie entry.
lis = html.xpath("//ol[@class='grid_view']/li")
movice = []
for li in lis:
    # Debug aid: print(etree.tostring(li, encoding='utf-8')) dumps the raw
    # markup of the current entry.

    # xpath() returns a list; fall back to "" when nothing matched rather
    # than raising IndexError.
    imgurls = li.xpath(".//div[@class='pic']//img/@src")
    imgurl = imgurls[0] if imgurls else ""

    # The title is spread over several <span> elements; concatenate their
    # text. `.text` is None for a span without leading text, so guard it.
    titles = li.xpath(".//div[@class='hd']//a//span")
    title = "".join(ItemTi.text or "" for ItemTi in titles)

    # string(.) collects all text inside the first <p>, including text of
    # nested elements.
    descs = li.xpath(".//div[@class='bd']/p")
    desc = descs[0].xpath('string(.)') if descs else ""

    MoviceItem = {
        'title': title,
        'desc': desc,
        'imgurl': imgurl
    }
    movice.append(MoviceItem)

# Print the collected list once, after every entry has been processed.
print(movice)