# Web Scraping

# urllib.request (built-in)

  • read: reads the whole response and returns bytes; an optional argument limits how many bytes to read

  • readline: reads a single line

  • readlines: reads line by line until the end, returning a list of lines

  • getcode: returns the response status code

  • geturl: returns the URL that was actually requested

  • getheaders: returns the response headers (these methods are demonstrated in the snippet after the basic example below)

  • Basic example

```python
import urllib.request

BAIDU_URL = "http://www.baidu.com"

# Simulate a request
httpRes = urllib.request.urlopen(BAIDU_URL)

# read returns the whole body as bytes, so decode it into a string yourself
content = httpRes.read().decode("utf-8")
print(f"Content:\n{content}")

# getcode returns the response status code
code = httpRes.getcode()
print(code)
```
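
As a follow-up, here is a short sketch showing the other response methods (readline, readlines, geturl, getheaders); it simply hits the Baidu homepage again for illustration:

```python
import urllib.request

httpRes = urllib.request.urlopen("http://www.baidu.com")

# readline: read a single line (bytes)
firstLine = httpRes.readline()
print(firstLine)

# readlines: read the remaining lines, returns a list of bytes objects
lines = httpRes.readlines()
print(len(lines))

# geturl: the URL that was actually fetched (may differ from the original after redirects)
print(httpRes.geturl())

# getheaders: response headers as a list of (key, value) tuples
print(httpRes.getheaders())

# getcode: response status code
print(httpRes.getcode())
```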
  • Custom Request
```python
# Simulate a request
import urllib.request
import urllib.parse

BAIDU_URL = "http://www.baidu.com"
queryData = {
    "wd": "周杰伦"
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

# urlencode turns the dict into a query string like wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
urlQueryParams = BAIDU_URL + "?" + urllib.parse.urlencode(queryData)
print(urlQueryParams)

# Build a custom Request carrying the headers (spoofing a browser User-Agent)
customRequest = urllib.request.Request(url=urlQueryParams, headers=headers)
httpRes = urllib.request.urlopen(customRequest)
content = httpRes.read().decode("utf-8")

print(content)
```
 
  • POST request
```python
import json
import urllib.request
import urllib.parse

url = "https://fanyi.baidu.com/sug"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

data = {
    "kw": "spider"
}

# POST bodies must be urlencoded and then encoded to bytes
data = urllib.parse.urlencode(data).encode("utf-8")
print(data)

# Passing data makes urllib send the request as POST
request = urllib.request.Request(url=url, headers=headers, data=data)
httpRes = urllib.request.urlopen(request)

content = httpRes.read().decode("utf-8")
jsonData = json.loads(content)
# print(content)
print(jsonData)
```
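
Real requests can fail (anti-scraping measures, network errors, and so on). Below is a minimal sketch of catching those failures with urllib.error; it reuses the `request` object built in the POST example above:

```python
import urllib.request
import urllib.error

try:
    httpRes = urllib.request.urlopen(request, timeout=10)
    print(httpRes.read().decode("utf-8"))
except urllib.error.HTTPError as e:
    # The server answered with an error status code (e.g. 404, 500)
    print("HTTPError:", e.code, e.reason)
except urllib.error.URLError as e:
    # DNS failure, connection refused, timeout, etc.
    print("URLError:", e.reason)
```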

# Downloading

```python
# Simulate a request
import urllib.request

BAIDU_URL = "http://www.baidu.com"
# urlretrieve saves the response body straight to a local file
urllib.request.urlretrieve(BAIDU_URL, "baidu.html")
```
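
urlretrieve can download binary resources (images, videos, ...) the same way, and accepts a reporthook callback for progress. A sketch follows; the image URL is a made-up placeholder:

```python
import urllib.request

# Placeholder URL, replace with a real image address
IMG_URL = "http://www.example.com/logo.png"

def reportProgress(blockNum, blockSize, totalSize):
    # blockNum: blocks transferred so far, blockSize: size of each block, totalSize: total file size
    if totalSize > 0:
        percent = min(blockNum * blockSize * 100 / totalSize, 100)
        print(f"downloaded {percent:.1f}%")

urllib.request.urlretrieve(IMG_URL, "logo.png", reporthook=reportProgress)
```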

# urllib.parse

  • urlencode: URL-encodes a dict of parameters and joins them into key=value&key=value form
  • quote: URL-encodes a single string (handy for putting Chinese text into a URL)
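
The difference between the two is easiest to see side by side (the comments show the expected output):

```python
import urllib.parse

# quote: encode a single string
print(urllib.parse.quote("周杰伦"))
# %E5%91%A8%E6%9D%B0%E4%BC%A6

# urlencode: encode a whole parameter dict and join with &
print(urllib.parse.urlencode({"wd": "周杰伦", "page": 1}))
# wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&page=1
```

A full request example combining this with urlencode (the KFC store-list endpoint) follows: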
```python
import urllib.request
import urllib.parse
import json

REQUEST_URL = "https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
queryData = {
    "pageIndex": 1,
    "pageSize": 10,
    "cname": "上海",
}

# For a GET request you could quote the Chinese text and append it to the URL
# (shown here only for illustration, not used below)
queryName = urllib.parse.quote("周杰伦")

# POST request: urlencode the parameters, then encode to bytes for the request body
queryData = urllib.parse.urlencode(queryData).encode("utf-8")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

request = urllib.request.Request(url=REQUEST_URL, headers=headers, data=queryData)
httpRes = urllib.request.urlopen(request)

content = httpRes.read().decode("utf-8")
print(content)

# Write the returned JSON string to a local file
with open("kfc.json", "w", encoding="utf-8") as file:
    file.write(content)
```

# handler

  • build_opener: builds an opener from one or more handlers
  • open: sends a request through the opener
  • HTTPHandler: handler for plain HTTP requests
  • ProxyHandler: handler that routes requests through a proxy (used in the example below)
```python
import urllib.request

requestUrl = "https://www.baidu.com/s?wd=ip"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url=requestUrl, headers=headers)

# The proxy IP is only an example; replace it with a working proxy
proxies = {
    "http": "59.54.238.245:19415"
}

# ProxyHandler -> build_opener -> open: send the request through the proxy
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
resp = opener.open(request)

content = resp.read().decode("utf-8")

with open("a.html", "w", encoding="utf-8") as file:
    file.write(content)
```
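
HTTPHandler works the same way; here is a minimal sketch without a proxy, just building an opener from HTTPHandler:

```python
import urllib.request

# HTTPHandler: handler for plain HTTP requests
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)

request = urllib.request.Request(
    url="http://www.baidu.com",
    headers={"User-Agent": "Mozilla/5.0"}
)
resp = opener.open(request)
print(resp.getcode())
```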

# jsonpath

# Syntax

| XPath | JSONPath | Description |
| --- | --- | --- |
| / | $ | the root element |
| . | @ | the current element |
| / | . or [] | child element |
| .. | n/a | parent element |
| // | .. | recursive descent; JSONPath borrows this from E4X |
| * | * | wildcard, matches all elements |
| @ | n/a | attribute access |
| [] | [] | subscript operator |
| \| | [,] | union operator; in XPath it merges node sets, JSONPath allows names or array indices |
| n/a | [start:end:step] | array slice operator, borrowed from ES4 |
| [] | ?() | applies a filter expression |
| n/a | () | script expression, evaluated by the underlying script engine |
| () | n/a | grouping in XPath |

# Notes

jsonpath can only parse local JSON data: load the JSON into a Python object first (json.load / json.loads); you cannot pass it a URL.
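
In other words, to parse JSON returned by an API, request it first, json.loads it, and then hand the object to jsonpath. A sketch follows, reusing the Baidu translate sug endpoint from earlier; the field name in `$..k` is an assumption about that response and should be adjusted to whatever the API actually returns:

```python
import json
import urllib.request
import urllib.parse

import jsonpath

url = "https://fanyi.baidu.com/sug"
headers = {"User-Agent": "Mozilla/5.0"}
data = urllib.parse.urlencode({"kw": "spider"}).encode("utf-8")

request = urllib.request.Request(url=url, headers=headers, data=data)
content = urllib.request.urlopen(request).read().decode("utf-8")

# Turn the response string into a Python object; jsonpath only works on such local objects
obj = json.loads(content)
# "$..k" collects every field named k (assumed field name, adjust as needed)
result = jsonpath.jsonpath(obj, "$..k")
print(result)
```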

  • Sample JSON
{ "store": {
  "book": [
    { "category": "reference",
      "author": "Nigel Rees",
      "title": "Sayings of the Century",
      "price": 8.95
    },
    { "category": "fiction",
      "author": "Evelyn Waugh",
      "title": "Sword of Honour",
      "price": 12.99
    },
    { "category": "fiction",
      "author": "Herman Melville",
      "title": "Moby Dick",
      "isbn": "0-553-21311-3",
      "price": 8.99
    },
    { "category": "fiction",
      "author": "J. R. R. Tolkien",
      "title": "The Lord of the Rings",
      "isbn": "0-395-19395-8",
      "price": 22.99
    }
  ],
  "bicycle": {
    "author": "king",
    "color": "red",
    "price": 19.95
  }
}
}

# jsonpath examples

  • Get the author of every book
```python
import json

import jsonpath

# Load the local JSON file into a Python object first
obj = json.load(open("io/json.json", 'r', encoding="utf-8"))
print(obj)

# $.store.book[*].author: the author of every element in the book array under store
authorList = jsonpath.jsonpath(obj, "$.store.book[*].author")

print(authorList)
```
  • Get every author, including the one under bicycle
```python
import json

import jsonpath

obj = json.load(open("io/json.json", 'r', encoding="utf-8"))
print(obj)

# $..author: recursive descent, matches every author, including the one under bicycle
authorList = jsonpath.jsonpath(obj, "$..author")

print(authorList)
```
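
Combining the slice and filter syntax from the table above allows more selective queries. A sketch follows; exact syntax support depends on the jsonpath library version you use:

```python
import json

import jsonpath

obj = json.load(open("io/json.json", 'r', encoding="utf-8"))

# Array slice: the first two books
print(jsonpath.jsonpath(obj, "$.store.book[0:2]"))

# Filter expression: books cheaper than 10
print(jsonpath.jsonpath(obj, "$..book[?(@.price<10)]"))

# Filter expression: books that have an isbn field
print(jsonpath.jsonpath(obj, "$..book[?(@.isbn)]"))
```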