Dianping's intellectual property notice is really overbearing! Better to keep a copy of my own reviews first. The code below saves the reviews and shop details to an SQLite database; if needed, it can also export them to CSV so that office software can open the data directly.
```python
from bs4 import BeautifulSoup
import sys, time, random, socket, sqlite3, csv
import urllib.request
import urllib.error

goOn = 1
stopDate = ''
UserID = ''
review = {'shopName': '', 'shopAddr': '', 'shopURL': '', 'reviewURL': '', 'star': '',
          'starDetail': '', 'costPerPeople': '', 'reviewText': '', 'dishes': '', 'reviewTime': ''}


def getHTML(url):
    """Fetch a page and return its HTML, or None on any failure."""
    print("Fetching " + url)
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0")
    try:
        response = urllib.request.urlopen(request)
    except (urllib.error.HTTPError, socket.error, urllib.error.URLError) as e:
        print('Connection error occurred when fetching data. ' + str(e))
    else:
        if response.code != 200:
            print("Error code: " + str(response.code))
        else:
            return response.read().decode('utf-8')
    return None  # any failure falls through to here


def getList(url):
    """Walk one page of the user's review list and fetch each review on it."""
    global review, goOn
    reviewList = getHTML(url)
    if reviewList is None:
        goOn = 0  # stop cleanly instead of crashing on a failed fetch
        return
    soupAll = BeautifulSoup(reviewList, "html.parser").find_all("div", {"class": "txt J_rptlist"})
    for soup in soupAll:
        shopLink = soup.find("a", {"class": "J_rpttitle"})
        review['shopName'] = shopLink.text
        review['shopURL'] = shopLink.get("href")
        shopAddr = soup.find("p", {"class": "col-exp"})
        review['shopAddr'] = shopAddr.text
        reviewID = soup.find("a", {"class": "J_flower aheart"})
        review['reviewURL'] = "http://www.dianping.com/review/" + reviewID.get("data-id")
        reviewDateDiv = soup.find("div", {"class": "mode-tc info"})
        reviewDateSpan = reviewDateDiv.find("span", {"class": "col-exp"})
        reviewDate = str(reviewDateSpan.text)[3:]
        if len(reviewDate) == 8 and reviewDate > stopDate:
            getReview(review['reviewURL'])
            # Throttle the crawl so Dianping does not block the IP
            time.sleep(random.randrange(5, 10))
        else:
            goOn = 0
    if goOn == 0:
        print("Finished.")
        sys.exit()


def save():
    """Append the current review to the per-user SQLite database."""
    global review, UserID
    conn = sqlite3.connect('DZDB_' + UserID + '_Reviews.db')
    c = conn.cursor()
    c.execute("""create table if not exists reviews
                 (ID integer primary key not NULL, shopName char(50), shopAddr char(100),
                  shopURL char(100), reviewURL char(100), star char(1), starDetail char(15),
                  costPerPeople char(15), reviewText TEXT, dishes char(100), reviewTime char(20))""")
    # A parameterized insert handles quotes and other special characters in the review text
    c.execute("""insert into reviews (ID, shopName, shopAddr, shopURL, reviewURL, star,
                 starDetail, costPerPeople, reviewText, dishes, reviewTime)
                 VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
              (review['shopName'], review['shopAddr'], review['shopURL'], review['reviewURL'],
               str(review['star']), review['starDetail'], review['costPerPeople'],
               review['reviewText'], review['dishes'], review['reviewTime']))
    conn.commit()
    c.close()
    print("Record at " + review['shopName'] + " saved to database.")
    review = {'shopName': '', 'shopAddr': '', 'shopURL': '', 'reviewURL': '', 'star': '',
              'starDetail': '', 'costPerPeople': '', 'reviewText': '', 'dishes': '', 'reviewTime': ''}


def getReview(url):
    """Parse a single review page into the global review dict and save it."""
    global review
    reviewHTML = getHTML(url)
    if reviewHTML is None:
        return  # skip this review if the fetch failed
    reviewAll = BeautifulSoup(reviewHTML, "html.parser")
    shopInfo = reviewAll.find("ul", {"class": "contList-info"})
    star = str(shopInfo.find("li"))
    # Map the star CSS class (msstar10 ... msstar50) to a 0-5 rating
    if "msstar50" in star:
        review['star'] = 5
    elif "msstar40" in star:
        review['star'] = 4
    elif "msstar30" in star:
        review['star'] = 3
    elif "msstar20" in star:
        review['star'] = 2
    elif "msstar10" in star:
        review['star'] = 1
    else:
        review['star'] = 0
    starDetail = ""
    for s in shopInfo.find_all("span", {"class": "rst"}):
        starDetail = starDetail + s.text[0:3]
    review['starDetail'] = starDetail
    review['reviewText'] = reviewAll.find("div", {"class": "contList-con"}).text
    for unit in reviewAll.find_all("div", {"class": "comment-unit"}):
        unit = str(unit.text).replace('\n', '')
        # The markers and slice offsets below match the Chinese page text
        if "人均:" in unit:  # cost per person
            review['costPerPeople'] = unit[4:]
        elif "喜欢的菜:" in unit:  # favorite dishes
            unit = unit.replace(' ', '')
            unit = unit.replace('\xa0', ' ')
            review['dishes'] = unit[7:]
    reviewInfo = reviewAll.find("ul", {"class": "contList-fn"})
    review['reviewTime'] = reviewInfo.find("li").text
    save()


def main():
    fun = int(input("Enter a number to choose a function:\n[1] fetch data, [2] export data:\n"))
    if fun == 1:
        fetchReview()
    elif fun == 2:
        sqliteToCSV()
    else:
        print("Please enter 1 or 2.")


def sqliteToCSV():
    """Dump the reviews table of a given database file to CSV."""
    dbFile = str(input("Enter the database file name:\n"))
    with open(dbFile + '.csv', 'w+', newline='') as csvfile:
        spamwriter = csv.writer(csvfile)
        conn = sqlite3.connect(dbFile)
        c = conn.cursor()
        spamwriter.writerow(['ID', 'shopName', 'shopAddr', 'shopURL', 'reviewURL', 'star',
                             'starDetail', 'costPerPeople', 'reviewText', 'dishes', 'reviewTime'])
        for row in c.execute('SELECT * FROM reviews'):
            spamwriter.writerow(row)
        c.close()
    print("CSV file exported successfully.")


def fetchReview():
    # Crawl parameters: user ID, start page, end date
    global stopDate, UserID
    UserID = str(input("Enter your Dianping user ID, visible in the URL of your profile page, e.g. 23262500:\n"))
    startPageNo = int(input("Start page number, e.g. 1:\n"))
    stopDate = str(input("Enter the end date for reviews (yy-mm-dd), e.g. 00-00-00:\n"))
    urlBase = "http://www.dianping.com/member/" + UserID + "/reviews?pg="
    startPageNo = startPageNo - 1
    while goOn == 1:
        startPageNo = startPageNo + 1
        getList(urlBase + str(startPageNo))


if __name__ == "__main__":
    main()
```
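If you want to confirm that records are actually landing in the database while the crawl runs, a quick check with Python's built-in sqlite3 module works. A minimal sketch, assuming the `DZDB_<UserID>_Reviews.db` file produced by the script and the example user ID 23262500 from the prompts above:

```python
import sqlite3

# Count the rows saved so far; the file name follows the script's
# 'DZDB_<UserID>_Reviews.db' pattern (23262500 is the example ID)
conn = sqlite3.connect('DZDB_23262500_Reviews.db')
c = conn.cursor()
c.execute("SELECT count(*) FROM reviews")
print("reviews saved:", c.fetchone()[0])
conn.close()
```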
A few notes
- Don't crawl too fast, or Dianping will block your IP; I ran into a block once at around page 20. If the crawl is interrupted, you can set the start page and end date to pick up where you left off (see the sketch after these notes). For readers new to SQL, w3school's SQL basics tutorial is attached.
- BeautifulSoup is a great tool, and Qpython3 even ships with it, but unfortunately this code throws a NoneType error when run on Qpython3 (presumably a find() call returning None).
- I have used it several times without any problems.
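Following up on the first note: below is a minimal sketch of how you might work out where an interrupted crawl stopped, plus a starter query in the spirit of the attached SQL tutorial. It assumes the `reviews` table created by `save()` above and the example user ID 23262500; none of this is part of the original script.

```python
import sqlite3

conn = sqlite3.connect('DZDB_23262500_Reviews.db')
c = conn.cursor()

# Rows are inserted in crawl order, so the highest ID is the last
# review saved before the interruption; its date tells you roughly
# which page to restart from.
c.execute("SELECT shopName, reviewTime FROM reviews ORDER BY ID DESC LIMIT 1")
print("last saved:", c.fetchone())

# Starter query: every shop rated five stars (star is stored as text)
for shopName, reviewTime in c.execute(
        "SELECT shopName, reviewTime FROM reviews WHERE star = '5'"):
    print(shopName, reviewTime)

conn.close()
```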