使用Python爬取某购物网站的评论信息。
客户提供的URL信息为TXT文件(每行一个网址)。
代码实现如下:
如有疑问及其它需求可私信我,谢谢!
#导入库
from bs4 import BeautifulSoup
import json
import random
import requests
import re
import os
import sys
import pandas as pd
from time import sleep
# Fetch a page while masquerading as a desktop Chrome browser.
def get_html(url):
    """Download *url* with browser-like headers and return its HTML text.

    Returns None when the request does not come back with HTTP 200.

    NOTE(review): the Cookie and session-token values below are hard-coded
    from one browser session and will expire — refresh them from a live
    browser (DevTools > Network) when requests start failing.
    """
    headers1 = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'session-id=132-1720367-9562261; session-id-time=2082787201l; i18n-prefs=USD; lc-main=zh_CN; sp-cdn="L5Z9:CN"; ubid-main=133-6420846-7804925; session-token="O3UlULvvCRokkDroX8qSnLtxqVwN7eEzOnwXRMPb4n49t7LOhY0X9ZXCylrXR8E2QuCTUFFBiaepsfckFqKkhgen/yXoxaqah3pyrHnEr0dof1qgLBnKiBvaOmOR81sa/NG1R6edkbXZMHQlyVOWclSZCAQ/E3hesiWntIIGpTLqTZWWvVSxpgTkpBxo7kTcFO6ouRwaKQvx5sngUjRCGoGTnhf6GtRQKWF4yRnhdDw="; csm-hit=tb:48HQZ7V78BDK4P2A1E0Z+s-48HQZ7V78BDK4P2A1E0Z|1674827897763&t:1674827897763&adb:adblk_no',
        'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    # timeout keeps the crawler from hanging forever on a dead connection
    result = requests.get(url, headers=headers1, timeout=30)
    if result.status_code == 200:
        # Success: return the page source as a string.
        print("获取网页:", url, "成功")
        return result.text
    # Non-200: report the status so the caller knows this URL was skipped.
    print("获取网页:", url, "失败", result.status_code)
    return None
# Excel template: one row per scraped review, columns filled in below.
df = pd.DataFrame({"url": [], "bookname": [], "author": [], "all_grade": [],
                   "grade": [], "name": [], "title": [], "review": []})

# Read the client-supplied URL list (one URL per line).
with open("url.txt", "r", encoding="utf-8") as f:
    f_lst = f.readlines()

row = 0  # next DataFrame row index to write

# Walk every URL in the file.
for u in f_lst:
    # Accept only lines that start with "https:"; blank/other lines are skipped.
    url = re.findall(r"^https:.*", u)
    if len(url) > 0:
        # Download the page source.
        html = get_html(url[0])
        if html is None:
            # Fetch failed — skip this URL instead of crashing on None.
            continue
        bsSoup = BeautifulSoup(html, 'html.parser')

        # Rating line for this book (string like "4.5,..."; split later).
        # NOTE(review): all the 'p'-tag selectors below mirror the page
        # structure observed in DevTools — re-verify if the site changes.
        award = bsSoup.find('p', class_="a-fixed-left-grid-col aok-align-center a-col-right")
        grade = award.text
        print(grade)

        # Book title.
        award1 = bsSoup.find('p', class_="a-fixed-left-grid-col product-info a-col-right")
        bookname = award1.text
        print(bookname)

        # Author line.
        award3 = bsSoup.find('p', class_="a-row product-by-line")
        author = award3.text
        print(author)

        # Global average star rating.
        award4 = bsSoup.find('p', class_="a-row a-spacing-medium averageStarRatingNumerical")
        all_grade = award4.text
        print(all_grade)

        # Review container; each child "a-section celwidget" is one review.
        award2 = bsSoup.find('p', class_="a-section a-spacing-none review-views celwidget")
        pinglun_lst = award2.find_all("p", class_="a-section celwidget")
        for i in pinglun_lst:
            df.loc[row, "url"] = url[0]
            df.loc[row, "author"] = author
            # Keep only the part of the rating string before the first comma.
            df.loc[row, "grade"] = grade.split(",")[0]
            df.loc[row, "all_grade"] = all_grade
            df.loc[row, "bookname"] = bookname
            df.loc[row, "name"] = i.find("p", class_="a-profile-content").text
            try:
                df.loc[row, "title"] = i.find("a", class_="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold").text.replace("\n", "")
            except AttributeError:
                # Some reviews render the title as a <span> instead of a link.
                df.loc[row, "title"] = i.find("span", class_="a-size-base review-title a-color-base review-title-content a-text-bold").text.replace("\n", "")
            df.loc[row, "review"] = i.find("p", class_="a-row a-spacing-small review-data").text.replace("\n", "")
            row += 1
        # Polite delay between pages to avoid hammering the site.
        sleep(2)

# Persist everything collected into a single Excel workbook.
df.to_excel("book.xlsx")
整体采集比较简单:使用浏览器开发者模式找到网页中对应信息的标签,用BeautifulSoup解析提取,最终结果使用Pandas保存成Excel文件。
如有疑问及其它需求可私信我,谢谢!
页面更新:2024-03-01
本站资料均由网友自行发布提供,仅用于学习交流。如有版权问题,请与我联系,QQ:4156828
© CopyRight 2008-2024 All Rights Reserved. Powered By bs178.com 闽ICP备11008920号-3
闽公网安备35020302034844号