UserAgent 处理
一、获取随机 UA
方法一:fake-useragent
python
pip install fake-useragent
python
from fake_useragent import UserAgent
import requests
# Create the fake-useragent accessor object used to look up UA strings.
ua = UserAgent()
# `ua.random` is read once, here, when the dict is built: every request that
# reuses this `headers` dict sends the same User-Agent value.
headers = {"User-Agent": ua.random}
response = requests.get("https://www.baidu.com", headers=headers)

支持按浏览器获取:ua.chrome、ua.firefox、ua.ie、ua.random
方法二:自建 UA 池
python
import random
# Hand-maintained pool of complete User-Agent strings; one entry is picked
# at random for the request headers below.
UA_POOL = [
# Chrome 122 on Windows 10 (x64)
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
# Chrome 122 on macOS 10.15.7
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
# Firefox 123 on Windows 10 (x64)
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
# Safari 17.2 on macOS 10.15.7
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
]
headers = {"User-Agent": random.choice(UA_POOL)}

方法三:程序化生成
python
import random
def generate_user_agent():
    """Return a random, well-formed desktop User-Agent string.

    A browser and an operating system are drawn at random and combined
    using the layout that browser family actually sends, matching the
    sample strings listed in ``UA_POOL``:

    * Chrome:  ``... AppleWebKit/537.36 (KHTML, like Gecko) Chrome/<v> Safari/537.36``
    * Edge:    the Chrome layout followed by ``Edg/<v>``
    * Firefox: ``Mozilla/5.0 (<platform>; rv:<v>) Gecko/20100101 Firefox/<v>``
    * Safari:  ``... AppleWebKit/605.1.15 (KHTML, like Gecko) Version/<v> Safari/605.1.15``

    Returns:
        str: The generated User-Agent header value.
    """
    # Full platform tokens; a bare "(Windows)" or "(Macintosh)" is not
    # something a real browser sends.
    platform_tokens = {
        'Windows': 'Windows NT 10.0; Win64; x64',
        'Macintosh': 'Macintosh; Intel Mac OS X 10_15_7',
        'Linux i686': 'X11; Linux i686',
        'Linux x86_64': 'X11; Linux x86_64',
    }
    browsers = [
        ('Chrome', '122.0.0.0'),
        ('Firefox', '123.0'),
        ('Safari', '17.2'),
        ('Edge', '122.0.0.0'),
    ]

    browser, version = random.choice(browsers)

    # Safari is only paired with macOS so the result stays plausible.
    candidates = ['Macintosh'] if browser == 'Safari' else list(platform_tokens)
    os_name = random.choice(candidates)
    platform = platform_tokens[os_name]

    if browser == 'Firefox':
        # Firefox writes the macOS version with dots and carries its own
        # "rv:" token; it has no AppleWebKit/Safari part.
        if os_name == 'Macintosh':
            platform = 'Macintosh; Intel Mac OS X 10.15'
        return f'Mozilla/5.0 ({platform}; rv:{version}) Gecko/20100101 Firefox/{version}'

    if browser == 'Safari':
        return (
            f'Mozilla/5.0 ({platform}) AppleWebKit/605.1.15 (KHTML, like Gecko) '
            f'Version/{version} Safari/605.1.15'
        )

    # Chrome and Chromium-based Edge share the same base string.
    user_agent = (
        f'Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) '
        f'Chrome/{version} Safari/537.36'
    )
    if browser == 'Edge':
        user_agent += f' Edg/{version}'
    return user_agent


# 二、解析 UA
方法一:ua-parser
python
pip install ua-parser
python
from ua_parser import user_agent_parser
# Sample input: a Chrome 66 UA string from an Android 6.0.1 device (model token NX531J).
ua_string = 'Mozilla/5.0 (Linux; Android 6.0.1; NX531J Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.126 Mobile Safari/537.36'
# Parse the raw string; the result is kept in `parsed` for inspection.
parsed = user_agent_parser.Parse(ua_string)
# 返回 device、os、user_agent 等信息

方法二:user-agents
python
pip install pyyaml ua-parser user-agents
python
from user_agents import parse
# Sample input: a Safari UA string from an iPhone running iOS 5.1.
ua_string = 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 Version/5.1 Mobile/9B179 Safari/7534.48.3'
user_agent = parse(ua_string)
# The bare expressions below only read attributes of the parsed object; the
# trailing comments give the value listed for this sample string.
user_agent.browser.family # 'Mobile Safari'
user_agent.os.family # 'iOS'
user_agent.device.family # 'iPhone'
user_agent.is_mobile # True
user_agent.is_tablet # False
user_agent.is_bot # False

三、Scrapy 中使用随机 UA
python
# middlewares.py
from fake_useragent import UserAgent
class RandomUserAgentMiddleware:
    """Scrapy downloader middleware that supplies a User-Agent from fake-useragent.

    The ``RANDOM_UA_TYPE`` setting (default ``"random"``) names the attribute
    that is read from the ``UserAgent`` object for each request.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the given crawler object."""
        return cls(crawler)

    def __init__(self, crawler):
        """Create the UA source and read the configured UA type from settings."""
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    def process_request(self, request, spider):
        """Set a default ``User-Agent`` header on ``request``.

        The value is looked up afresh on every call. ``setdefault`` is used,
        so a header that is already present is presumably left untouched.
        Returns ``None`` implicitly.
        """
        chosen_agent = getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', chosen_agent)
# settings.py
# Register the custom middleware under order value 543. The built-in
# UserAgentMiddleware is mapped to None -- per Scrapy's convention this
# presumably disables it; confirm against the Scrapy settings docs.
DOWNLOADER_MIDDLEWARES = {
'your_project.middlewares.RandomUserAgentMiddleware': 543,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Read by RandomUserAgentMiddleware via crawler.settings.get("RANDOM_UA_TYPE", "random").
RANDOM_UA_TYPE = "random" # or "firefox", "chrome"