Dianping's intellectual-property statement is really overbearing! Better to keep a copy of your own reviews first. The code below saves your reviews and the reviewed shops to a SQLite database, and can also export them to CSV if needed, so office software can open them directly.
from bs4 import BeautifulSoup
import time, random, urllib.request, urllib.error, socket, sqlite3, csv
goOn = 1
stopDate = ''
UserID = ''
review = {'shopName': '', 'shopAddr': '', 'shopURL': '', 'reviewURL': '', 'star': '',
          'starDetail': '', 'costPerPeople': '', 'reviewText': '', 'dishes': '', 'reviewTime': ''}
def getHTML(url):
    print("Fetching " + url)
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0")
    try:
        response = urllib.request.urlopen(request)
    except (urllib.error.HTTPError, socket.error, urllib.error.URLError) as e:
        # Falls through and returns None when the fetch fails
        print("Connection error occurred when fetching data: " + str(e))
    else:
        if response.code != 200:
            print("Error code: " + str(response.code))
        else:
            html = response.read().decode('utf-8')
            return html
def getList(url):
    global review, goOn
    reviewList = getHTML(url)
    soupAll = BeautifulSoup(reviewList, 'html.parser').find_all("div", {"class": "txt J_rptlist"})
    for soup in soupAll:
        shopLink = soup.find("a", {"class": "J_rpttitle"})
        review['shopName'] = shopLink.text
        review['shopURL'] = shopLink.get("href")
        shopAddr = soup.find("p", {"class": "col-exp"})
        review['shopAddr'] = shopAddr.text
        reviewID = soup.find("a", {"class": "J_flower aheart"})
        review['reviewURL'] = "http://www.dianping.com/review/" + reviewID.get("data-id")
        reviewDateDiv = soup.find("div", {"class": "mode-tc info"})
        reviewDateSpan = reviewDateDiv.find("span", {"class": "col-exp"})
        reviewDate = str(reviewDateSpan.text)[3:]
        if len(reviewDate) == 8 and reviewDate > stopDate:
            getReview(review['reviewURL'])
            # Scraping frequency: sleep a random 5-9 seconds between requests
            time.sleep(random.randrange(5, 10))
        else:
            goOn = 0
    if goOn == 0:
        print("Finished.")
        exit()
def save():
    global review, UserID
    conn = sqlite3.connect('DZDB_' + UserID + '_Reviews.db')
    c = conn.cursor()
    c.execute("""create table if not exists reviews
                 (ID integer primary key not NULL, shopName char(50), shopAddr char(100),
                  shopURL char(100), reviewURL char(100), star char(1), starDetail char(15),
                  costPerPeople char(15), reviewText TEXT, dishes char(100), reviewTime char(20))""")
    # Parameterized insert, so quotes in the review text cannot break the SQL
    c.execute("""insert into reviews (ID, shopName, shopAddr, shopURL, reviewURL, star,
                 starDetail, costPerPeople, reviewText, dishes, reviewTime)
                 VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
              (review['shopName'], review['shopAddr'], review['shopURL'],
               review['reviewURL'], str(review['star']), review['starDetail'],
               review['costPerPeople'], review['reviewText'], review['dishes'],
               review['reviewTime']))
    conn.commit()
    c.close()
    conn.close()
    print("Record for " + review['shopName'] + " saved to database.")
    review = {'shopName': '', 'shopAddr': '', 'shopURL': '', 'reviewURL': '', 'star': '',
              'starDetail': '', 'costPerPeople': '', 'reviewText': '', 'dishes': '', 'reviewTime': ''}
def getReview(url):
    global review
    reviewHTML = getHTML(url)
    reviewAll = BeautifulSoup(reviewHTML, 'html.parser')
    shopInfo = reviewAll.find("ul", {"class": "contList-info"})
    # The star rating is encoded in a CSS class name such as "msstar40"
    star = str(shopInfo.find("li"))
    if "msstar50" in star:
        review['star'] = 5
    elif "msstar40" in star:
        review['star'] = 4
    elif "msstar30" in star:
        review['star'] = 3
    elif "msstar20" in star:
        review['star'] = 2
    elif "msstar10" in star:
        review['star'] = 1
    else:
        review['star'] = 0
    starDetails = shopInfo.find_all("span", {"class": "rst"})
    starDetail = ""
    for s in starDetails:
        starDetail = starDetail + s.text[0:3]
    review['starDetail'] = starDetail
    reviewText = reviewAll.find("div", {"class": "contList-con"})
    review['reviewText'] = reviewText.text
    units = reviewAll.find_all("div", {"class": "comment-unit"})
    for unit in units:
        unit = str(unit.text).replace('\n', '')
        if "人均:" in unit:          # "per-person cost" label on the page
            review['costPerPeople'] = unit[4:]
        elif "喜欢的菜:" in unit:    # "favorite dishes" label on the page
            unit = unit.replace(' ', '')
            unit = unit.replace('\xa0', ' ')
            review['dishes'] = unit[7:]
    reviewInfo = reviewAll.find("ul", {"class": "contList-fn"})
    reviewTime = reviewInfo.find("li")
    review['reviewTime'] = reviewTime.text
    save()
def main():
    fun = int(input("Enter a number to choose a function:\n[1] scrape data, [2] export data: \n"))
    if fun == 1:
        fetchReview()
    elif fun == 2:
        sqliteToCSV()
    else:
        print("Please enter 1 or 2.")
def sqliteToCSV():
    dbFile = str(input("Enter the database file name:\n"))
    with open(dbFile + '.csv', 'w+', newline='', encoding='utf-8') as csvfile:
        spamwriter = csv.writer(csvfile)
        conn = sqlite3.connect(dbFile)
        c = conn.cursor()
        spamwriter.writerow(['ID', 'shopName', 'shopAddr', 'shopURL', 'reviewURL', 'star',
                             'starDetail', 'costPerPeople', 'reviewText', 'dishes', 'reviewTime'])
        for row in c.execute('SELECT * FROM reviews'):
            spamwriter.writerow(row)
        c.close()
        conn.close()
    print("CSV file exported successfully.")
def fetchReview():
    # Scraping parameters: user ID, start page, cutoff date
    global stopDate, UserID
    UserID = str(input("Enter your Dianping user ID (visible in the URL of your profile page, e.g. 23262500):\n"))
    startPageNo = int(input("Page number to start from, e.g. 1:\n"))
    stopDate = str(input("Enter the cutoff date for reviews (yy-mm-dd), e.g. 00-00-00:\n"))
    urlBase = "http://www.dianping.com/member/" + UserID + "/reviews?pg="
    startPageNo = startPageNo - 1
    while goOn == 1:
        startPageNo = startPageNo + 1
        getList(urlBase + str(startPageNo))
if __name__ == "__main__":
    main()
A few notes:
- Don't scrape too fast, or Dianping will block your IP; I got blocked once at around page 20. If a run is interrupted unexpectedly, you can set the start page and cutoff date to continue where you left off (a small query sketch for finding the resume point follows these notes). w3school's SQL basics tutorial covers the SQL used here.
- BeautifulSoup is a great tool, and even QPython3 ships with it, but unfortunately this code throws a NoneType error when run on QPython3 (see the defensive-parsing sketch below).
- I have run the script several times without problems.
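To find where to resume after an interruption, you can ask the database how far the last run got. A minimal sketch, where lastSaved is a hypothetical helper: it assumes the DZDB_<UserID>_Reviews.db file created by save() above, and that the reviewTime strings sort chronologically.
import sqlite3
# Hypothetical helper, not part of the script above: report how many
# reviews are saved and the oldest date reached (the profile pages run
# from newest to oldest), so you know which stopDate/page to resume with.
def lastSaved(dbFile):
    conn = sqlite3.connect(dbFile)
    c = conn.cursor()
    c.execute("SELECT COUNT(*), MIN(reviewTime) FROM reviews")
    count, oldest = c.fetchone()
    conn.close()
    return count, oldest
# Example: lastSaved('DZDB_23262500_Reviews.db') might return (123, '14-05-01').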
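As for the NoneType error: find() returns None when a page lacks the expected markup, for example when Dianping serves a block page or when a different HTML parser reads the markup differently, and calling .text on that None raises the error. A minimal defensive sketch (the variable names are illustrative, not part of the script above):
html = getHTML(url)
if html is None:
    print("Fetch failed, skipping " + url)     # getHTML returns None on error
else:
    soup = BeautifulSoup(html, 'html.parser')  # name the parser explicitly
    shopInfo = soup.find("ul", {"class": "contList-info"})
    if shopInfo is None:
        print("Unexpected page layout, skipping " + url)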