In [1]:
from pyecharts.charts import Pie, Bar, TreeMap, Map, Geo
from wordcloud import WordCloud, ImageColorGenerator
from pyecharts import options as opts
from pyecharts.globals import ThemeType
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import numpy as np
import jieba
In [2]:
df = pd.read_csv('../file/douyin.csv',encoding = 'utf-8-sig')
df.head()
Out[2]:
name gender country province city location category fans videos likes comments shares following school custom_verify enterprise_verify signature
0 人民日报 0 中国 北京 北京 北京 政企 117259000 2427 1165446000 11906782048 9089061412 18 NaN NaN 人民日报官方账号 参与、沟通、记录时代。
1 央视新闻 0 中国 北京 北京 北京 政企 105648000 3681 3814571666 2603872833 1989050522 27 NaN 央视新闻官方账号 央视新闻官方抖音号 本宝宝暂时还没想到个性签名
2 陈赫 1 中国 NaN 上海 上海 明星 68374000 422 570096000 430908721 117639297 131 上海戏剧学院 演员陈赫 NaN 😎帅到没朋友 💁🏻‍♂️有东西直播间 🔜1⃣️1⃣️1⃣️8⃣️🕔 ☑️公众号➰[陈赫]
3 Dear-迪丽热巴 0 NaN NaN NaN NaN 明星 49790000 29 181167000 202448645 151645265 0 NaN 演员 NaN NaN
4 毒舌电影 1 中国 广东 广州 广州 剧情 46355000 616 820393000 28026109 13005392 24 NaN 优质影视自媒体、抖音影评团成员 NaN 看电影, 可以改变人生。 商务邮:dsmovie@youhaoxi.cn ❤️ 日历预售...
In [27]:
df.loc[df.gender == '0', 'gender'] = '未知'
df.loc[df.gender == '1', 'gender'] = '男性'
df.loc[df.gender == '2', 'gender'] = '女性'
# 根据性别分组
gender_message = df.groupby(['gender'])
# 对分组后的结果进行计数
gender_com = gender_message['gender'].agg(['count'])
gender_com.reset_index(inplace=True)

# 饼图数据
attr = gender_com['gender']
v1 = gender_com['count']

# 初始化配置
pie = Pie(init_opts=opts.InitOpts(width="800px", height="400px",theme=ThemeType.LIGHT))
# 添加数据,设置半径
pie.add("", [list(z) for z in zip(attr, v1)], radius=["40%", "75%"])
# 设置全局配置项,标题、图例、工具箱(下载图片)
pie.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V性别分布情况", pos_left="center", pos_top="top"),
                    legend_opts=opts.LegendOpts(orient="vertical", pos_left="left"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}))
# 设置系列配置项,标签样式
pie.set_series_opts(label_opts=opts.LabelOpts(is_show=True, formatter="{b}:{d}%",font_size=14))
pie.render_notebook()
Out[27]:
In [26]:
df = df.sort_values('likes', ascending=False)
# 获取TOP10的数据
attr = df['name'][0:10]
v1 = [float('%.1f' % (float(i) / 100000000)) for i in df['likes'][0:10]]

# 初始化配置
bar = Bar(init_opts=opts.InitOpts(width="1000px", height="600px"))
# x轴数据
bar.add_xaxis(list(reversed(attr.tolist())))
# y轴数据
bar.add_yaxis("", list(reversed(v1)),color = '#84E0E3')
# 设置全局配置项,标题、y轴分割线
bar.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V点赞数TOP10(亿)", pos_left="center", pos_top="18"),
                    xaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)),
                    yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=12))
                   )
# 设置系列配置项,标签样式
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="right", color="black"))
bar.reversal_axis()
bar.render_notebook()
Out[26]:
In [29]:
# 将数据分段
Bins = [0, 1000000, 5000000, 10000000, 25000000, 50000000, 100000000, 5000000000]
Labels = ['0-100', '100-500', '500-1000', '1000-2500', '2500-5000', '5000-10000', '10000以上']
len_stage = pd.cut(df['likes'], bins=Bins, labels=Labels).value_counts().sort_index()
# 获取数据
attr = len_stage.index.tolist()
v1 = len_stage.values.tolist()

# 生成柱状图
bar = Bar(init_opts=opts.InitOpts(width="800px", height="400px"))
bar.add_xaxis(attr)
bar.add_yaxis("", v1,color = '#84E0E3')
bar.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V点赞数分布情况(万)", pos_left="center", pos_top="18"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                    yaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)))
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="top", color="black"))
bar.render_notebook()
Out[29]:
In [34]:
df = df.sort_values('fans', ascending=False)
attr = df['name'][0:10]
v1 = ['%.1f' % (float(i) / 10000) for i in df['fans'][0:10]]

bar = Bar(init_opts=opts.InitOpts(width="1000px", height="600px"))
bar.add_xaxis(list(reversed(attr.tolist())))
bar.add_yaxis("", list(reversed(v1)),color = '#84E0E3')
bar.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V粉丝数TOP10(万)", pos_left="center", pos_top="18"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                    xaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)))
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="right", color="black"))
bar.reversal_axis()
bar.render_notebook()
Out[34]:
In [37]:
Bins = [0, 1500000, 2000000, 5000000, 10000000, 25000000, 200000000]
Labels = ['0-150', '150-200', '200-500', '500-1000', '1000-2500', '5000以上']
len_stage = pd.cut(df['fans'], bins=Bins, labels=Labels).value_counts().sort_index()

attr = len_stage.index.tolist()
v1 = len_stage.values.tolist()

bar = Bar(init_opts=opts.InitOpts(width="800px", height="400px"))
bar.add_xaxis(attr)
bar.add_yaxis("", v1,color = '#84E0E3')
bar.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V粉丝数分布情况(万)", pos_left="center", pos_top="18"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                    yaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)))
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="top", color="black"))
bar.render_notebook()
Out[37]:
In [40]:
df = df.sort_values('comments', ascending=False)
attr = df['name'][0:10]
v1 = ['%.1f' % (float(i) / 100000000) for i in df['comments'][0:10]]

bar = Bar(init_opts=opts.InitOpts(width="1000px", height="600px"))
bar.add_xaxis(list(reversed(attr.tolist())))
bar.add_yaxis("", list(reversed(v1)),color = '#84E0E3')
bar.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V评论数TOP10(亿)", pos_left="center", pos_top="18"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                    xaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)))
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="right", color="black"))
bar.reversal_axis()
bar.render_notebook()
Out[40]:
In [44]:
df = df.sort_values('shares', ascending=False)
attr = df['name'][0:10]
v1 = ['%.1f' % (float(i) / 100000000) for i in df['shares'][0:10]]

bar = Bar(init_opts=opts.InitOpts(width="1000px", height="600px"))
bar.add_xaxis(list(reversed(attr.tolist())))
bar.add_yaxis("", list(reversed(v1)),color = '#84E0E3')

bar.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V分享数TOP10(亿)", pos_left="center", pos_top="18"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                    xaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)),
                    yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=30))
                   )
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="right", color="black"))
bar.reversal_axis()
bar.render_notebook()
Out[44]:
In [28]:
# 分组求和
likes_type_message = df.groupby(['category'])
likes_type_com = likes_type_message['likes'].agg(['sum'])
likes_type_com.reset_index(inplace=True)
# 处理数据
dom = [{'name':name, 'value':num} for name, num in zip(likes_type_com['category'], likes_type_com['sum'])]

# 初始化配置
treemap = TreeMap(init_opts=opts.InitOpts(width="1000px", height="600px",theme=ThemeType.LIGHT))
# 添加数据
treemap.add('', dom)
# 设置全局配置项,标题、工具箱(下载图片)
treemap.set_global_opts(title_opts=opts.TitleOpts(title="各类型抖音大V点赞数汇总图", pos_left="center", pos_top="5"),
                        toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                        legend_opts=opts.LegendOpts(is_show=False),
                        
                       )
treemap.render_notebook()
Out[28]:
In [32]:
dom = []
fans_type_message = df.groupby(['category'])
fans_type_com = fans_type_message['fans'].agg(['sum'])
fans_type_com.reset_index(inplace=True)
for name, num in zip(fans_type_com['category'], fans_type_com['sum']):
    data = {}
    data['name'] = name
    data['value'] = num
    dom.append(data)

treemap = TreeMap(init_opts=opts.InitOpts(width="1000px", height="600px",theme=ThemeType.LIGHT))
treemap.add('', dom)
treemap.set_global_opts(title_opts=opts.TitleOpts(title="各类型抖音大V粉丝数汇总图", pos_left="center", pos_top="5"),
                        toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                        legend_opts=opts.LegendOpts(is_show=False))
treemap.set_series_opts(treemapbreadcrumb_opts=opts.TreeMapBreadcrumbOpts(is_show=False))
treemap.render_notebook()
Out[32]:
In [4]:
# 筛选
df = df[df['videos'] > 0]
# 计算单个视频平均点赞数
df.eval('result = likes/(videos*10000)', inplace=True)
df['result'] = df['result'].round(decimals=1)
df = df.sort_values('result', ascending=False)

# 取TOP10
attr = df['name'][0:10]
v1 = ['%.1f' % (float(i)) for i in  df['result'][0:10]]

# 初始化配置
bar = Bar(init_opts=opts.InitOpts(width="1000px", height="600px"))
# 添加数据
bar.add_xaxis(list(reversed(attr.tolist())))
bar.add_yaxis("", list(reversed(v1)),color = '#84E0E3')
# 设置全局配置项,标题、工具箱(下载图片)、y轴分割线
bar.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V平均视频点赞数TOP10(万)", pos_left="center", pos_top="18"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                    xaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)))
# 设置系列配置项
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="right", color="black"))
# 翻转xy轴
bar.reversal_axis()
bar.render_notebook()
Out[4]:
In [13]:
# 筛选数据
df = df[df["country"] == "中国"]
df1 = df.copy()
# 数据替换
df1["province"] = df1["province"].str.replace("省", "").str.replace("壮族自治区", "").str.replace("维吾尔自治区", "").str.replace("自治区", "")
# 分组计数
df_num = df1.groupby("province")["province"].agg(count="count")
df_province = df_num.index.values.tolist()
df_count = df_num["count"].values.tolist()

# 初始化配置
map = Map(init_opts=opts.InitOpts(width="1000px", height="600px"))
# 中国地图
map.add("", [list(z) for z in zip(df_province, df_count)], "china")
# 设置全局配置项,标题、工具箱(下载图片)、颜色图例
map.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V省份分布情况", pos_left="center", pos_top="0"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                    # 设置数值范围0-600,is_piecewise标签值连续
                    visualmap_opts=opts.VisualMapOpts(max_=600, is_piecewise=False))
map.render_notebook()
Out[13]:
In [17]:
df1 = df[(df["school"] != "") & (df["school"] != "已毕业") & (df["school"] != "未知")]
df1 = df1.copy()
df_num = df1.groupby("school")["school"].agg(count="count").reset_index().sort_values(by="count", ascending=False)
df_school = df_num[:10]["school"].values.tolist()
df_count = df_num[:10]["count"].values.tolist()

# 初始化配置
bar = Bar(init_opts=opts.InitOpts(width="1200px", height="400px"))
bar.add_xaxis(df_school)
bar.add_yaxis("", df_count,color = '#84E0E3')
bar.set_global_opts(title_opts=opts.TitleOpts(title="抖音大V毕业学校TOP10", pos_left="center", pos_top="18"),
                    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
                    yaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)))
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="top", color="black"))
bar.render_notebook()
Out[17]:
In [21]:
"""
生成标题以及摘要词云
"""
words = pd.read_csv('../file/chineseStopWords.txt', encoding='gbk', sep='\t', names=['stopword'])
# 分词
text = ''
df1 = df[df["signature"] != ""]
df1 = df1.copy()
for line in df1['signature']:
    text += ' '.join(jieba.cut(str(line).replace(" ", ""), cut_all=False))
# 停用词
stopwords = set('')
stopwords.update(words['stopword'])
backgroud_Image = plt.imread('../file/douyin.png')
# 使用抖音背景色
alice_coloring = np.array(Image.open(r"../file/douyin.png"))
image_colors = ImageColorGenerator(alice_coloring)
wc = WordCloud(
    background_color='white',
    mask=backgroud_Image,
    font_path='../file/simhei.ttf',
    max_words=2000,
    max_font_size=70,
    min_font_size=1,
    prefer_horizontal=1,
    color_func=image_colors,
    random_state=50,
    stopwords=stopwords,
    margin=5
)
wc.generate_from_text(text)
wc.to_file('../file/douyin_word.png')
print('生成词云成功!')
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\10076\AppData\Local\Temp\jieba.cache
Loading model cost 0.550 seconds.
Prefix dict has been built successfully.
c:\users\10076\appdata\local\programs\python\python38\lib\site-packages\wordcloud\wordcloud.py:995: UserWarning: mask image should be unsigned byte between 0 and 255. Got a float array
  warnings.warn("mask image should be unsigned byte between 0"
生成词云成功!