Text data analysis

Example 1: word cloud from product reviews (CSV)

import pandas as pd
from nltk import FreqDist
import jieba
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Load the product-review table; the file is read as GBK.
# Handing the path to pandas lets it open and close the file itself.
file_data = pd.read_csv('商品评价信息.csv', encoding='gbk')

# Drop fully duplicated rows so repeated reviews are not counted twice.
file_data = file_data.drop_duplicates()

# Join the review column explicitly. Calling str() on the underlying array
# would segment the array's repr (brackets, quotes, and an elided "..." once
# the array is large) rather than the complete review text.
review_text = '\n'.join(file_data['评价信息'].dropna().astype(str))
cut_words = jieba.lcut(review_text)

# Keep the stop words as a set of whole words. Testing against the raw file
# text would be a substring match and would also discard any token that
# merely appears inside a longer stop word.
# NOTE(review): assumes the list is whitespace-separated — confirm.
with open('停用词表.txt', encoding='utf-8') as stop_file:
    stop_words = set(stop_file.read().split())

# Keep tokens that are neither pure whitespace nor stop words.
new_data = [
    word for word in cut_words
    if word.strip() and word not in stop_words
]

# Word frequencies, most frequent first.
freq_list = FreqDist(new_data)
most_common_words = freq_list.most_common()

# Render the word cloud using the STXINGKA font installed under Windows.
font = r'C:\Windows\Fonts\STXINGKA.TTF'
wc = WordCloud(
    font_path=font,
    background_color='white',
    width=1000,
    height=800,
).generate(" ".join(new_data))
plt.imshow(wc)

plt.axis('off')
plt.show()

Example 2: mask-shaped word cloud from a plain-text file

from nltk import FreqDist
import jieba
from matplotlib import pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image

# Read the whole transcript; the context manager closes the file afterwards.
with open('Report14-习近平在北京大学师生座谈会上的讲话.txt', encoding='utf-8') as text_file:
    file_data = text_file.read()

# Remove ASCII spaces before segmentation.
file_data = file_data.replace(' ', '')

cut_words = jieba.lcut(file_data)

# Keep the stop words as a set of whole words. Testing against the raw file
# text would be a substring match and would also discard any token that
# merely appears inside a longer stop word.
# NOTE(review): assumes the list is whitespace-separated — confirm.
with open('停用词表.txt', encoding='utf-8') as stop_file:
    stop_words = set(stop_file.read().split())

# Keep tokens that are neither pure whitespace nor stop words.
new_data = [
    word for word in cut_words
    if word.strip() and word not in stop_words
]

# Word frequencies over the filtered tokens.
freq_list = FreqDist(new_data)
print(new_data)

# (word, count) pairs, most frequent first.
most_common_words = freq_list.most_common()


# print(most_common_words)

def makeImage(text, mask_path="picture.jpg", font_path=None):
    """Draw a word cloud of *text* inside a mask image and display it.

    Args:
        text: Space-separated words to render.
        mask_path: Image file used as the cloud's mask. Defaults to
            "picture.jpg", the file the function always used before.
        font_path: Font file used to draw the words. When omitted, the
            module-level ``font`` is used, matching the previous behavior.
    """
    if font_path is None:
        # Fall back to the global so existing makeImage(text) calls still work.
        font_path = font

    # Load the mask inside a context manager so the image file is closed.
    with Image.open(mask_path) as mask_image:
        alice_mask = np.array(mask_image)

    plt.figure(figsize=(9, 9))
    # width/height are kept from the original call; per the wordcloud docs the
    # mask's shape takes precedence over them when a mask is supplied.
    wc = WordCloud(
        font_path=font_path,
        background_color='white',
        max_words=1000,
        mask=alice_mask,
        width=900,
        height=900,
    )
    # generate word cloud
    wc.generate(text)

    # show
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()


# Render the cloud: set the font that makeImage reads, then pass it the
# filtered tokens joined with single spaces.
font = r'C:\Windows\Fonts\STXINGKA.TTF'
cloud_text = " ".join(new_data)
makeImage(cloud_text)

Attachment

Other


Text data analysis
https://www.hardyhu.cn/2022/03/28/Text-data-analysis/
Author
John Doe
Posted on
March 28, 2022
Licensed under