如何用 Python 获取一个网页的所有链接
如何用 Python 获取一个网页的所有链接
很简单，直接上代码：
# -*- coding: utf-8 -*-
"""Fetch a web page and print every unique link found in its <a> tags.

author: zhangsir
"""
import time  # not used by this script; kept in case other code in the file needs it
from urllib.parse import urljoin

import requests
from lxml import etree

# Page to scan for links.
url = 'https://www.baidu.com'

# Request headers that mimic a desktop browser.
# NOTE(review): the Cookie value looks like it was copied from a browser
# session and will presumably expire -- confirm and refresh before relying on it.
headers = {
    "Host": "www.baidu.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
    "Cookie": "BIDUPSID=C7465C07C18C65C97A99CA9252B157C5; PSTM=1663725617; BAIDUID=C7465C07C18C65C93CEFB95934D4AD90:FG=1; BD_HOME=1; H_PS_PSSID=36558_36462_36921_37300_36885_37345_36569_37405_37261_26350_37285_22157_37231; BD_UPN=1a314753; BAIDUID_BFESS=C7465C07C18C65C93CEFB95934D4AD90:FG=1; BA_HECTOR=84al0lal2184a4ah810lo25o1hiks0j19; ZFY=2znAwD2OEqUOkcGnGOgJBJQB89SW7H2KCibOnac:ALc4:C",
}

# Fetch the page. Without a timeout requests waits indefinitely on a stalled
# server, and without raise_for_status() an HTTP error page would be parsed
# as if it were the requested page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()

# Decoded HTML source of the page.
page_source = response.text

# Parse into an lxml element tree. A blank body cannot be parsed into a
# document, so it is treated as a page without links.
tree = etree.HTML(page_source) if page_source.strip() else None

# Collect the href attribute of every <a> element.
raw_hrefs = tree.xpath('//a/@href') if tree is not None else []

# Resolve relative and protocol-relative hrefs against the final URL (after
# any redirects), drop blank values, de-duplicate with a set, and sort so the
# output order is the same on every run.
links = sorted(
    {urljoin(response.url, href.strip()) for href in raw_hrefs if href.strip()}
)

# Print the result.
print(links)