# Fetch a page and parse the response with BeautifulSoup.
# Fix: requests and BeautifulSoup were used without ever being imported.
import requests
from bs4 import BeautifulSoup

r = requests.get("https://xxxxx.com/xxxx")
r.text     # response body decoded to str
r.content  # raw response body as bytes
soup = BeautifulSoup(r.content, "lxml")
# find_all: return a list of every matching Tag.
# Fix: normalized the legacy camelCase alias findAll to find_all for
# consistency with the rest of these examples (same behavior in BS4).
soup.find_all("a")
soup.find_all('ul', class_='image-items')
soup.find_all("a", class_="link", href="/link")
soup.find_all("a", attrs={"class": "link", "href": "/link"})
soup.find_all(class_="link", href="/link")
soup.find_all(attrs={"class": "link", "href": "/link"})
# find: return the first matching Tag (or None when nothing matches).
soup.find('a')
soup.find(id='template-embed-sample')
soup.find('a', string='hogefuga')
soup.find('div', id='main')
# string= (BS4 4.4+) and the older text= keyword match the same node.
soup.find(string='hoge') == soup.find(text='hoge')
# Match Tags with a regular expression.
# Fixes: "//" is not a Python comment marker (syntax error), and
# "import re" was repeated three times — import once at the top.
import re

# Tags whose name starts with "b" (e.g. <b>, <body>).
soup.find_all(re.compile("^b"))
# Tags whose href attribute contains "link".
soup.find_all(href=re.compile("link"))
# <a> tags whose text contains "hello".
soup.find_all("a", text=re.compile("hello"))
# CSS-selector lookup with select / select_one.
soup.select("#link1")                    # by id
soup.select('a[href^="http://"]')        # attribute prefix match
soup.select_one(".stoksPrice").string    # first element with this class
soup.select("div.resultContent")         # tag name + class
# Attribute-style shortcut: the first <a> Tag in the document.
soup.a
# Working with a Tag's attributes and contents.
soup.find("a").attrs['href']
# Fix: original read soup.find("a").["title"] — the "." before the
# subscript is a syntax error; Tags support plain item access.
soup.find("a")["title"]
# Text inside the Tag.
soup.a.string
# Child Tags of the first <p>.
soup.p.find_all("a")
# Set / add a Tag attribute.
# NOTE(review): `a` is presumably a Tag obtained earlier (e.g. soup.a) —
# it is not defined in this snippet.
a["target"] = "_blank"
# unwrap(): remove a Tag but keep its contents in place.
html = '''
<div>
<a href="/link">spam</a>
</div>
'''
# Fix: pass the parser explicitly — BeautifulSoup(html) with no parser
# emits GuessedAtParserWarning; "lxml" matches the rest of this file.
soup = BeautifulSoup(html, "lxml")
soup.div.a.unwrap()
soup.div
# <div>spam</div>
# new_tag(): create a fresh Tag; wrap() puts it around an existing one.
# Fix: original comment said "new_tagz" — typo for new_tag.
html = '''
<div>
<a href="/link">spam</a>
</div>
'''
# Fix: explicit parser, consistent with the rest of this file.
soup = BeautifulSoup(html, "lxml")
soup.div.a.wrap(soup.new_tag("p"))
# prettify(): render the subtree as nicely indented HTML.
soup.div.prettify()