# 爬虫
# urllib.request(自带)
read:读取响应体,read() 一次读取全部字节,read(n) 读取 n 个字节
readline:读取一行
readlines:按行读取,直到读取完毕
getcode:获取响应状态码
geturl:获取实际请求的 url
getheaders:获取响应头
简单示例
BAIDU_URL = "http://www.baidu.com"
# Minimal crawl example: fetch a page and print its body and status code.
import urllib.request

# urlopen returns an http.client.HTTPResponse; use it as a context manager
# so the underlying socket is always released, even on error.
with urllib.request.urlopen(BAIDU_URL) as httpRes:
    # read() returns bytes; decode explicitly to obtain a str.
    content = httpRes.read().decode("utf-8")
    code = httpRes.getcode()
print(f"内容是\n{content}")
print(code)
- 自定义request
# GET with a custom Request: attach query parameters and a User-Agent header.
import urllib.request
import urllib.parse

BAIDU_URL = "http://www.baidu.com"
queryData = {
    "wd": "周杰伦"
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
# urlencode percent-encodes each key/value pair and joins them with "&".
urlQueryParams = BAIDU_URL + "?" + urllib.parse.urlencode(queryData)
print(urlQueryParams)
# A Request object is needed to send custom headers (plain urlopen cannot).
customRequest = urllib.request.Request(url=urlQueryParams, headers=headers)
# Context manager guarantees the response socket is closed.
with urllib.request.urlopen(customRequest) as httpRes:
    content = httpRes.read().decode("utf-8")
print(content)
- post请求
# POST example: urllib sends a POST automatically when data= is supplied.
import json
import urllib.request
import urllib.parse

url = "https://fanyi.baidu.com/sug"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
data = {
    "kw": "spider"
}
# POST form data must be url-encoded AND encoded to bytes before sending.
data = urllib.parse.urlencode(data).encode("utf-8")
print(data)
request = urllib.request.Request(url=url, headers=headers, data=data)
# Context manager guarantees the response socket is closed.
with urllib.request.urlopen(request) as httpRes:
    content = httpRes.read().decode("utf-8")
# The endpoint returns JSON text; parse it into a Python object.
jsonData = json.loads(content)
print(jsonData)
# Download: save a URL's response body straight to a local file.
import urllib.request

BAIDU_URL = "http://www.baidu.com"
# urlretrieve fetches the URL and writes the body to baidu.html in one call.
urllib.request.urlretrieve(url=BAIDU_URL, filename="baidu.html")
# urllib.parse
- urlencode:将多个参数编码为 url 的百分号编码(percent-encoding)并用 &amp; 拼接
- quote:将单个字符串编码为百分号编码
# POST the KFC store-list API and save the JSON response to a local file.
import urllib.request
import urllib.parse
import json

REQUEST_URL = "https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
queryData = {
    "pageIndex": 1,
    "pageSize": 10,
    "cname": "上海",
}
# Supplying data= makes urllib send a POST; the form body must be
# url-encoded and then encoded to bytes.
queryData = urllib.parse.urlencode(queryData).encode("utf-8")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url=REQUEST_URL, headers=headers, data=queryData)
# Context manager guarantees the response socket is closed.
with urllib.request.urlopen(request) as httpRes:
    content = httpRes.read().decode("utf-8")
print(content)
# Write the whole body in one call; the with-statement guarantees the
# file is flushed and closed even if writing fails.
with open("kdf.json", "w", encoding="utf-8") as file:
    file.write(content)
# handler
- build_opener
- open
- HTTPHandler
# Route a request through an HTTP proxy via ProxyHandler + build_opener.
import urllib.request

requestURl = "https://www.baidu.com/s?wd=ip"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url=requestURl, headers=headers)
# Maps scheme -> proxy address; only http traffic uses this proxy.
proxys = {
    "http": "59.54.238.245:19415"
}
handler = urllib.request.ProxyHandler(proxies=proxys)
# build_opener produces an opener whose open() behaves like urlopen()
# but goes through the installed handler chain.
opener = urllib.request.build_opener(handler)
# Context manager guarantees the response socket is closed.
with opener.open(request) as resp:
    content = resp.read().decode("utf-8")
# Write the whole body at once; with guarantees the file is closed.
with open("a.html", "w", encoding="utf-8") as file:
    file.write(content)
# jsonpath (opens new window)
# 语法
xPath | JSONPath | 描述
---|---|---
/ | $ | 表示根元素 |
. | @ | 当前元素 |
/ . | or [] | 子元素 |
.. | n/a | 父元素 |
// | .. | 递归下降,JSONPath是从E4X借鉴的。 |
* | * | 通配符,表示所有的元素 |
@ | n/a | 属性访问字符 |
[] | [] | 子元素操作符 |
\| | [,] | 连接操作符在XPath 结果合并其它结点集合。JSONPath允许name或者数组索引。 |
n/a | [start:end:step] | 数组分割操作从ES4借鉴。 |
[] | ?() | 应用过滤表示式 |
n/a | () | 脚本表达式,使用在脚本引擎下面。 |
() | n/a | Xpath分组 |
# 注意点
jsonpath 只能以本地的json解析
- 示例json
{ "store": {
"book": [
{ "category": "reference",
"author": "Nigel Rees",
"title": "Sayings of the Century",
"price": 8.95
},
{ "category": "fiction",
"author": "Evelyn Waugh",
"title": "Sword of Honour",
"price": 12.99
},
{ "category": "fiction",
"author": "Herman Melville",
"title": "Moby Dick",
"isbn": "0-553-21311-3",
"price": 8.99
},
{ "category": "fiction",
"author": "J. R. R. Tolkien",
"title": "The Lord of the Rings",
"isbn": "0-395-19395-8",
"price": 22.99
}
],
"bicycle": {
"author": "king",
"color": "red",
"price": 19.95
}
}
}
# jsonpath 示例
- 获取所有的author
import json
import jsonpath

# jsonpath only operates on an already-parsed local Python object,
# so load the sample JSON file first; with guarantees the file closes.
with open("io/json.json", 'r', encoding="utf-8") as fp:
    obj = json.load(fp)
print(obj)
# $.store.book[*].author -> the author of every book under store.
authorList = jsonpath.jsonpath(obj, "$.store.book[*].author")
print(authorList)
- 获取所有的author包含bicycle的
import json
import jsonpath

# Load the sample document; with guarantees the file handle is closed
# (the original open() was never closed).
with open("io/json.json", 'r', encoding="utf-8") as fp:
    obj = json.load(fp)
print(obj)
# $..author -> recursive descent: every "author" key anywhere in the
# document, including store.bicycle.author.
authorList = jsonpath.jsonpath(obj, "$..author")
print(authorList)