# Web Scraping

# urllib.request (built-in)

  • read: reads the whole response and returns bytes; an optional argument limits how many bytes to read

  • readline: reads a single line

  • readlines: reads line by line until the end, returning a list of lines

  • getcode: returns the response status code

  • geturl: returns the URL that was actually requested

  • getheaders: returns the response headers (these methods are demonstrated in the snippet after the basic example below)

  • Basic example

```python
import urllib.request

BAIDU_URL = "http://www.baidu.com"

# Simulate a request
httpRes = urllib.request.urlopen(BAIDU_URL)

# read returns the whole body as bytes, so decode it into a string yourself
content = httpRes.read().decode("utf-8")
print(f"Content:\n{content}")

# getcode returns the response status code
code = httpRes.getcode()
print(code)
```
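
As a follow-up, here is a short sketch showing the other response methods (readline, readlines, geturl, getheaders); it simply hits the Baidu homepage again for illustration:

```python
import urllib.request

httpRes = urllib.request.urlopen("http://www.baidu.com")

# readline: read a single line (bytes)
firstLine = httpRes.readline()
print(firstLine)

# readlines: read the remaining lines, returns a list of bytes objects
lines = httpRes.readlines()
print(len(lines))

# geturl: the URL that was actually fetched (may differ from the original after redirects)
print(httpRes.geturl())

# getheaders: response headers as a list of (key, value) tuples
print(httpRes.getheaders())

# getcode: response status code
print(httpRes.getcode())
```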
  • Custom Request
```python
# Simulate a request
import urllib.request
import urllib.parse

BAIDU_URL = "http://www.baidu.com"
queryData = {
    "wd": "周杰伦"
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

# urlencode turns the dict into a query string like wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
urlQueryParams = BAIDU_URL + "?" + urllib.parse.urlencode(queryData)
print(urlQueryParams)

# Build a custom Request carrying the headers (spoofing a browser User-Agent)
customRequest = urllib.request.Request(url=urlQueryParams, headers=headers)
httpRes = urllib.request.urlopen(customRequest)
content = httpRes.read().decode("utf-8")

print(content)
```
 
  • POST request
```python
import json
import urllib.request
import urllib.parse

url = "https://fanyi.baidu.com/sug"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

data = {
    "kw": "spider"
}

# POST bodies must be urlencoded and then encoded to bytes
data = urllib.parse.urlencode(data).encode("utf-8")
print(data)

# Passing data makes urllib send the request as POST
request = urllib.request.Request(url=url, headers=headers, data=data)
httpRes = urllib.request.urlopen(request)

content = httpRes.read().decode("utf-8")
jsonData = json.loads(content)
# print(content)
print(jsonData)
```
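
Real requests can fail (anti-scraping measures, network errors, and so on). Below is a minimal sketch of catching those failures with urllib.error; it reuses the `request` object built in the POST example above:

```python
import urllib.request
import urllib.error

try:
    httpRes = urllib.request.urlopen(request, timeout=10)
    print(httpRes.read().decode("utf-8"))
except urllib.error.HTTPError as e:
    # The server answered with an error status code (e.g. 404, 500)
    print("HTTPError:", e.code, e.reason)
except urllib.error.URLError as e:
    # DNS failure, connection refused, timeout, etc.
    print("URLError:", e.reason)
```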

# Downloading

```python
# Simulate a request
import urllib.request

BAIDU_URL = "http://www.baidu.com"
# urlretrieve saves the response body straight to a local file
urllib.request.urlretrieve(BAIDU_URL, "baidu.html")
```
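
urlretrieve can download binary resources (images, videos, ...) the same way, and accepts a reporthook callback for progress. A sketch follows; the image URL is a made-up placeholder:

```python
import urllib.request

# Placeholder URL, replace with a real image address
IMG_URL = "http://www.example.com/logo.png"

def reportProgress(blockNum, blockSize, totalSize):
    # blockNum: blocks transferred so far, blockSize: size of each block, totalSize: total file size
    if totalSize > 0:
        percent = min(blockNum * blockSize * 100 / totalSize, 100)
        print(f"downloaded {percent:.1f}%")

urllib.request.urlretrieve(IMG_URL, "logo.png", reporthook=reportProgress)
```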

# urllib.parse

  • urlencode: URL-encodes a dict of parameters and joins them into key=value&key=value form
  • quote: URL-encodes a single string (handy for putting Chinese text into a URL)
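
The difference between the two is easiest to see side by side (the comments show the expected output):

```python
import urllib.parse

# quote: encode a single string
print(urllib.parse.quote("周杰伦"))
# %E5%91%A8%E6%9D%B0%E4%BC%A6

# urlencode: encode a whole parameter dict and join with &
print(urllib.parse.urlencode({"wd": "周杰伦", "page": 1}))
# wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&page=1
```

A full request example combining this with urlencode (the KFC store-list endpoint) follows: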
```python
import urllib.request
import urllib.parse
import json

REQUEST_URL = "https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
queryData = {
    "pageIndex": 1,
    "pageSize": 10,
    "cname": "上海",
}

# For a GET request you could quote the Chinese text and append it to the URL
# (shown here only for illustration, not used below)
queryName = urllib.parse.quote("周杰伦")

# POST request: urlencode the parameters, then encode to bytes for the request body
queryData = urllib.parse.urlencode(queryData).encode("utf-8")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

request = urllib.request.Request(url=REQUEST_URL, headers=headers, data=queryData)
httpRes = urllib.request.urlopen(request)

content = httpRes.read().decode("utf-8")
print(content)

# Write the returned JSON string to a local file
with open("kfc.json", "w", encoding="utf-8") as file:
    file.write(content)
```

# handler

  • build_opener: builds an opener from one or more handlers
  • open: sends a request through the opener
  • HTTPHandler: handler for plain HTTP requests
  • ProxyHandler: handler that routes requests through a proxy (used in the example below)
```python
import urllib.request

requestUrl = "https://www.baidu.com/s?wd=ip"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url=requestUrl, headers=headers)

# The proxy IP is only an example; replace it with a working proxy
proxies = {
    "http": "59.54.238.245:19415"
}

# ProxyHandler -> build_opener -> open: send the request through the proxy
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
resp = opener.open(request)

content = resp.read().decode("utf-8")

with open("a.html", "w", encoding="utf-8") as file:
    file.write(content)
```
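
HTTPHandler works the same way; here is a minimal sketch without a proxy, just building an opener from HTTPHandler:

```python
import urllib.request

# HTTPHandler: handler for plain HTTP requests
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)

request = urllib.request.Request(
    url="http://www.baidu.com",
    headers={"User-Agent": "Mozilla/5.0"}
)
resp = opener.open(request)
print(resp.getcode())
```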

# jsonpath

# Syntax

| XPath | JSONPath | Description |
| --- | --- | --- |
| / | $ | the root element |
| . | @ | the current element |
| / | . or [] | child element |
| .. | n/a | parent element |
| // | .. | recursive descent; JSONPath borrows this from E4X |
| * | * | wildcard, matches all elements |
| @ | n/a | attribute access |
| [] | [] | subscript operator |
| \| | [,] | union operator; in XPath it merges node sets, JSONPath allows names or array indices |
| n/a | [start:end:step] | array slice operator, borrowed from ES4 |
| [] | ?() | applies a filter expression |
| n/a | () | script expression, evaluated by the underlying script engine |
| () | n/a | grouping in XPath |

# Notes

jsonpath can only parse local JSON data: load the JSON into a Python object first (json.load / json.loads); you cannot pass it a URL.
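
In other words, to parse JSON returned by an API, request it first, json.loads it, and then hand the object to jsonpath. A sketch follows, reusing the Baidu translate sug endpoint from earlier; the field name in `$..k` is an assumption about that response and should be adjusted to whatever the API actually returns:

```python
import json
import urllib.request
import urllib.parse

import jsonpath

url = "https://fanyi.baidu.com/sug"
headers = {"User-Agent": "Mozilla/5.0"}
data = urllib.parse.urlencode({"kw": "spider"}).encode("utf-8")

request = urllib.request.Request(url=url, headers=headers, data=data)
content = urllib.request.urlopen(request).read().decode("utf-8")

# Turn the response string into a Python object; jsonpath only works on such local objects
obj = json.loads(content)
# "$..k" collects every field named k (assumed field name, adjust as needed)
result = jsonpath.jsonpath(obj, "$..k")
print(result)
```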

  • Sample JSON
{ "store": {
  "book": [
    { "category": "reference",
      "author": "Nigel Rees",
      "title": "Sayings of the Century",
      "price": 8.95
    },
    { "category": "fiction",
      "author": "Evelyn Waugh",
      "title": "Sword of Honour",
      "price": 12.99
    },
    { "category": "fiction",
      "author": "Herman Melville",
      "title": "Moby Dick",
      "isbn": "0-553-21311-3",
      "price": 8.99
    },
    { "category": "fiction",
      "author": "J. R. R. Tolkien",
      "title": "The Lord of the Rings",
      "isbn": "0-395-19395-8",
      "price": 22.99
    }
  ],
  "bicycle": {
    "author": "king",
    "color": "red",
    "price": 19.95
  }
}
}

# jsonpath examples

  • Get the author of every book
```python
import json

import jsonpath

# Load the local JSON file into a Python object first
obj = json.load(open("io/json.json", 'r', encoding="utf-8"))
print(obj)

# $.store.book[*].author: the author of every element in the book array under store
authorList = jsonpath.jsonpath(obj, "$.store.book[*].author")

print(authorList)
```
  • Get every author, including the one under bicycle
```python
import json

import jsonpath

obj = json.load(open("io/json.json", 'r', encoding="utf-8"))
print(obj)

# $..author: recursive descent, matches every author, including the one under bicycle
authorList = jsonpath.jsonpath(obj, "$..author")

print(authorList)
```
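
Combining the slice and filter syntax from the table above allows more selective queries. A sketch follows; exact syntax support depends on the jsonpath library version you use:

```python
import json

import jsonpath

obj = json.load(open("io/json.json", 'r', encoding="utf-8"))

# Array slice: the first two books
print(jsonpath.jsonpath(obj, "$.store.book[0:2]"))

# Filter expression: books cheaper than 10
print(jsonpath.jsonpath(obj, "$..book[?(@.price<10)]"))

# Filter expression: books that have an isbn field
print(jsonpath.jsonpath(obj, "$..book[?(@.isbn)]"))
```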