爬取豆瓣《功夫》短评,并清洗数据
(Scrape Douban short reviews of the movie "Kung Fu Hustle" and clean the data.)
import os
import re
import time
import random
from lxml import etree
import requests
import jieba
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Session object shared by every request so the login cookie persists
# across login_douban() and spider_comment().
s = requests.Session()
# File that scraped review text is appended to (and re-read by cut_word()).
COMMENTS_FILE_PATH = 'douban_comments.txt'
# Font used by the word cloud. A CJK-capable font is required for Chinese
# text; this is a macOS path — adjust on other systems.
WC_FONT_PATH = '/Library/Fonts/Songti.ttc'


def login_douban():
    """
    Log in to Douban through the mobile basic-login endpoint.

    Posts the credentials with the module-level session ``s`` so the
    login cookie is retained for the subsequent scraping requests.

    :return: 1 on success, 0 if the login request failed.
    """
    # Login URL
    login_url = 'https://accounts.douban.com/j/mobile/login/basic'
    # Referer header is required by the endpoint.
    headers = {'user-agent': 'Mozilla/5.0', 'Referer': 'https://accounts.douban.com/passport/login?source=main'}
    # Credentials: replace the placeholders with a real account/password.
    data = {'name': '你的账号',
            'password': '你的密码',
            'remember': 'false'}
    try:
        r = s.post(login_url, headers=headers, data=data)
        r.raise_for_status()
    # Narrowed from a bare `except:`, which also swallowed KeyboardInterrupt
    # and programming errors; only network/HTTP failures are expected here.
    except requests.RequestException:
        print('登录请求失败')
        return 0
    # Print the server response so login problems are visible.
    print(r.text)
    return 1


def spider_comment(page=0):
    """
    Scrape one page of short reviews for the movie (subject 1905462).

    :param page: zero-based page index; each page holds 20 comments.
    :return: 1 if comments were found and appended to COMMENTS_FILE_PATH,
             0 on request failure or when the page has no comments
             (the latter is the stop signal for batch_spider_comment).
    """
    print('开始爬取第%d页' % int(page))
    start = int(page * 20)
    comment_url = 'https://movie.douban.com/subject/1905462/comments?start=%d&limit=20&sort=new_score&status=P' % start
    # Request headers
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        r = s.get(comment_url, headers=headers)
        r.raise_for_status()
    # Narrowed from a bare `except:` — only network/HTTP errors are expected.
    except requests.RequestException:
        print('第%d页爬取请求失败' % page)
        return 0

    result = etree.HTML(r.text)
    # BUG FIX: the original assigned the xpath result to `comment` but then
    # tested and wrote `comments`, raising NameError on every successful page.
    comments = result.xpath('//span[@class = "short"]/text()')
    if not comments:
        return 0
    # Append to file. r.encoding can be None when the server omits a charset,
    # which would make open() fall back to the platform default — use utf-8.
    with open(COMMENTS_FILE_PATH, 'a+', encoding=r.encoding or 'utf-8') as file:
        file.writelines('\n'.join(comments))
    return 1


def batch_spider_comment():
    """
    Scrape Douban review pages one after another until a page yields nothing.

    :return: None
    """
    # Start from a clean file so a rerun does not append duplicate data.
    if os.path.exists(COMMENTS_FILE_PATH):
        os.remove(COMMENTS_FILE_PATH)
    current_page = 0
    while True:
        if not spider_comment(current_page):
            break
        current_page += 1
        # Random pause (0–3 s) to mimic a human reader and avoid an IP ban.
        time.sleep(random.random() * 3)
    print('爬取完毕')


def cut_word():
    """
    Tokenize the saved comment text with jieba.

    :return: a single space-joined string of all tokens, ready for
             WordCloud.generate().
    """
    # Explicit encoding: without it open() uses the platform default
    # (e.g. GBK on Windows) and fails on the scraped UTF-8 text.
    # NOTE(review): assumes the scraper wrote UTF-8 — confirm against
    # the encoding used in spider_comment().
    with open(COMMENTS_FILE_PATH, encoding='utf-8') as file:
        comment_txt = file.read()
    # cut_all=True is jieba "full mode": emit every possible segmentation.
    wordlist = jieba.cut(comment_txt, cut_all=True)
    wl = " ".join(wordlist)
    print(wl)
    return wl


def create_word_cloud():
    """
    Build a word cloud from the tokenized comments and display it.

    :return: None
    """
    # BUG FIX: the original read `np.array(Image.open(WC_MASK_IMG))`, but
    # WC_MASK_IMG is defined nowhere in this file (NameError at runtime),
    # and the resulting array was never passed to WordCloud anyway — the
    # dead mask line is removed. To use a shape mask, define WC_MASK_IMG
    # and pass `mask=np.array(Image.open(WC_MASK_IMG))` to WordCloud.
    # Data-cleaning stop words: Chinese filler words plus terms ("电影",
    # "没有") that would otherwise dominate the cloud. A set gives O(1)
    # membership tests during generation.
    stop_words = {'就是', '不是', '但是', '还是', '只是', '这样', '这个', '一个', '什么', '电影', '没有'}
    # Word-cloud configuration: font, background color, size/scale limits.
    wc = WordCloud(background_color="white", max_words=50, scale=4,
                   max_font_size=50, random_state=42, stopwords=stop_words, font_path=WC_FONT_PATH)
    # Generate the cloud from the space-joined token string.
    wc.generate(cut_word())

    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    # BUG FIX: the original called plt.figure() *after* imshow(), which
    # opened an extra blank window; the stray call is removed.
    plt.show()


if __name__ == '__main__':
    # Scraping is disabled here; uncomment the lines below to log in
    # and crawl before building the word cloud (only scrape after a
    # successful login).
    # if login_douban():
    #     # spider_comment(30)
    #     batch_spider_comment()
    create_word_cloud()