最近看了几部电影,也读了不少影评,想了解大多数观众对这些电影的评价。正好我在学习 Python,就利用它强大的爬虫能力抓取影评并生成词云。本文使用的环境是 Python 3.6.1。
下面是相关代码:
# coding: utf-8
"""Scrape Douban short comments for a now-playing movie and show a word cloud.

Pipeline: fetch the "now playing" list, download several pages of short
comments for one movie, keep only the Chinese characters, segment them with
jieba, drop stop words read from ``stopwords.txt``, count word frequencies
with pandas and render the counts with ``wordcloud`` / matplotlib.
"""
__author__ = 'hang'

import warnings
warnings.filterwarnings("ignore")
import jieba  # Chinese word segmentation
import numpy  # numerical package
import codecs  # codecs.open lets the caller pick a file encoding and decodes to unicode on read
import re
import pandas as pd
import matplotlib.pyplot as plt
from urllib import request
from bs4 import BeautifulSoup as bs
# %matplotlib inline  (only applicable inside IPython)
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word cloud package


class KetWord:
    """A word together with its occurrence count; instances order by count."""

    def __init__(self, name, count):
        self.name = name
        self.count = count

    def __cmp__(self, other):
        """Return 1, -1 or 0 comparing ``self.count`` with ``other.count``.

        Python 3 never calls ``__cmp__`` implicitly; it is kept for callers
        that invoke it directly. Returns ``NotImplemented`` when *other* is
        not a ``KetWord``.
        """
        if not isinstance(other, KetWord):
            return NotImplemented
        return (self.count > other.count) - (self.count < other.count)

    def __lt__(self, other):
        """Order by count so ``sorted()`` / ``list.sort()`` work on Python 3."""
        if not isinstance(other, KetWord):
            return NotImplemented
        return self.count < other.count

    def __str__(self):
        return '[name=' + self.name + ':count=' + str(self.count) + ']'


def _fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(url) as resp:
        return resp.read().decode('utf-8')


# Parse the "now playing" page
def getNowPlayingMovie_list():
    """Return the now-playing movies as a list of ``{'id': ..., 'name': ...}``.

    ``id`` is the ``data-subject`` attribute of each ``li.list-item`` inside
    ``div#nowplaying``; ``name`` is the ``alt`` text of the item's ``<img>``
    (the last one when there are several). Returns an empty list when the
    ``div#nowplaying`` container is not present in the page.
    """
    soup = bs(_fetch_html('https://movie.douban.com/nowplaying/hangzhou/'),
              'html.parser')
    container = soup.find('div', id='nowplaying')
    if container is None:
        return []

    nowplaying_list = []
    for item in container.find_all('li', class_='list-item'):
        movie = {'id': item['data-subject']}
        for img in item.find_all('img'):
            movie['name'] = img['alt']
        nowplaying_list.append(movie)
    return nowplaying_list


# Download one page of comments
def getCommentsById(movieId, pageNum):
    """Return the short comments found on page *pageNum* of movie *movieId*.

    Args:
        movieId: subject id as it appears in the movie URL.
        pageNum: 1-based page number; each page requests 20 comments.

    Returns:
        A list with the text of the first ``<p>`` of every ``div.comment``
        whose ``.string`` is not ``None``, or ``False`` when *pageNum* is
        not positive.
    """
    if not pageNum > 0:
        return False
    start = (pageNum - 1) * 20
    requrl = 'https://movie.douban.com/subject/{}/comments?start={}&limit=20'.format(
        movieId, start)
    print(requrl)

    soup = bs(_fetch_html(requrl), 'html.parser')
    eachCommentList = []
    for item in soup.find_all('div', class_='comment'):
        paragraph = item.find('p')
        # .string is None when the <p> holds nested tags instead of plain text.
        if paragraph is not None and paragraph.string is not None:
            eachCommentList.append(paragraph.string)
    return eachCommentList


def main(movie_index=4, page_count=10):
    """Build and display a word cloud from one movie's short comments.

    Args:
        movie_index: position of the movie in the now-playing list.
        page_count: number of comment pages to download, starting at page 1.

    Raises:
        IndexError: if the now-playing list has no entry at *movie_index*.
    """
    # Each entry looks like {'id': '26363254', 'name': '战狼2'}.
    NowPlayingMovie_list = getNowPlayingMovie_list()
    print('common=', NowPlayingMovie_list)
    try:
        movie = NowPlayingMovie_list[movie_index]
    except IndexError:
        raise IndexError(
            'no movie at index {} (found {} now-playing movies)'.format(
                movie_index, len(NowPlayingMovie_list))) from None

    commentList = []
    for page in range(1, page_count + 1):
        commentList.extend(getCommentsById(movie['id'], page))

    # Keep only runs of CJK characters; this drops punctuation, digits and
    # latin text in a single pass.
    comments = ''.join(commentList)
    cleaned_comments = ''.join(re.findall(r'[\u4e00-\u9fa5]+', comments))

    # Segment the text into words with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Drop stop words; quoting=3 (csv.QUOTE_NONE) reads every line verbatim.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Count occurrences per word, most frequent first. groupby().size() is
    # used because dict-based renaming in SeriesGroupBy.agg is rejected by
    # pandas 1.0 and later.
    words_stat = words_df.groupby('segment').size().reset_index(name='计数')
    words_stat = words_stat.sort_values(by=["计数"], ascending=False)

    # Word -> count mapping for the 1000 most frequent words.
    word_frequence_list = {
        str(word): count for word, count in words_stat.head(1000).values
    }

    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    wordcloud = wordcloud.generate_from_frequencies(word_frequence_list)
    print(word_frequence_list)

    # "%matplotlib inline" is an IPython magic and is unavailable in a plain
    # script, so the figure is displayed explicitly through pyplot.
    plt.imshow(wordcloud)
    plt.colorbar()
    plt.show()


# Script entry point
if __name__ == '__main__':
    main()