# Text data analysis

import pandas as pd
from nltk import FreqDist
import jieba
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# --- Word cloud of the product-review column --------------------------------
# Load the GBK-encoded review CSV; the context manager closes the handle as
# soon as pandas has parsed it.
with open('商品评价信息.csv', encoding='gbk') as review_file:
    file_data = pd.read_csv(review_file)

# Drop fully duplicated rows so repeated reviews are not counted twice.
file_data = file_data.drop_duplicates()

# Join the real review strings before segmenting. Segmenting
# str(ndarray) would feed jieba the array's printed form (brackets, quotes,
# and a "..." ellipsis once NumPy summarizes a long array) instead of the data.
review_text = " ".join(file_data['评价信息'].dropna().astype(str))
cut_words = jieba.lcut(review_text)

# Stop words are compared as whole tokens, one per line of the file.
# Testing `word in <entire file text>` is a substring test and would also
# discard any word that merely appears inside some longer stop word.
with open('停用词表.txt', encoding='utf-8') as stop_file:
    stop_words = {line.strip() for line in stop_file if line.strip()}

# Whitespace-only tokens (spaces, newlines) carry no meaning; drop them
# explicitly since they are no longer caught by the stop-word lookup.
new_data = [
    word for word in cut_words
    if word.strip() and word not in stop_words
]

# Word frequencies, most frequent first.
freq_list = FreqDist(new_data)
most_common_words = freq_list.most_common()

# Render the word cloud. The font is a Windows font path; a font that
# contains CJK glyphs is needed for the Chinese words to render.
font = r'C:\Windows\Fonts\STXINGKA.TTF'
wc = WordCloud(font_path=font, background_color='white', width=1000, height=800).generate(
    " ".join(new_data))
plt.imshow(wc)

plt.axis('off')
plt.show()

from nltk import FreqDist
import jieba
from matplotlib import pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image

# --- Tokenize the speech transcript and filter stop words -------------------
# Read the whole transcript; the context manager closes the handle.
with open('Report14-习近平在北京大学师生座谈会上的讲话.txt', encoding='utf-8') as speech_file:
    file_data = speech_file.read()

# Remove ASCII spaces before segmentation.
file_data = file_data.replace(' ', '')

cut_words = jieba.lcut(file_data)

# Stop words are compared as whole tokens, one per line of the file.
# Testing `word in <entire file text>` is a substring test and would also
# discard any word that merely appears inside some longer stop word.
with open('停用词表.txt', encoding='utf-8') as stop_file:
    stop_words = {line.strip() for line in stop_file if line.strip()}

# Whitespace-only tokens (e.g. newlines) are dropped explicitly since they
# are no longer caught by the stop-word lookup.
new_data = [
    word for word in cut_words
    if word.strip() and word not in stop_words
]

# Word frequencies, most frequent first.
freq_list = FreqDist(new_data)
print(new_data)

most_common_words = freq_list.most_common()


# print(most_common_words)

def makeImage(text, mask_path="picture.jpg", font_path=None):
    """Draw *text* as a mask-shaped word cloud and show it with matplotlib.

    Args:
        text: Text handed to ``WordCloud.generate``.
        mask_path: Image file converted to the mask array. Defaults to
            "picture.jpg", the path that used to be hard-coded.
        font_path: Font file passed to ``WordCloud``. When None, the
            module-level ``font`` variable is used, as before.
    """
    if font_path is None:
        # Resolved at call time, so ``font`` only has to exist before the call.
        font_path = font

    # Convert the image to an array inside the context manager so the file
    # handle is released deterministically.
    with Image.open(mask_path) as mask_image:
        alice_mask = np.array(mask_image)

    plt.figure(figsize=(9, 9))
    # NOTE(review): the wordcloud docs state that width/height are ignored
    # when a mask is supplied; they are kept here unchanged.
    wc = WordCloud(font_path=font_path, background_color='white', max_words=1000,
                   mask=alice_mask, width=900, height=900)
    # generate word cloud
    wc.generate(text)

    # show
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()


# Font file read by makeImage through the module-level name `font`.
font = r'C:\Windows\Fonts\STXINGKA.TTF'
# Join the filtered tokens with spaces and render the masked word cloud.
cloud_text = " ".join(new_data)
makeImage(cloud_text)