upload

Feuoy · Mar 9, 2020 · e2bd25c · e2bd25c
1 parent fcafd28
commit e2bd25c
Show file tree

Hide file tree

Showing 143 changed files with 1,832,710 additions and 0 deletions.
diff --git a/analysis.py b/analysis.py
@@ -0,0 +1,99 @@
+import random
+import xlrd
+import excelSave as save
+import os
+import jieba
+from snownlp import SnowNLP
+
+# 插入文本到txt中
+def insertToTxt(content, fileName):
+    with open(fileName, "a", encoding='utf-8') as file:
+        file.write(content + ",")
+
+# 插入xls
+def insert_data(elems,sentiments_temp):
+    path = "test4.xls"
+    for index,elem in enumerate(elems):
+        workbook = xlrd.open_workbook(path)  # 打开工作簿
+        sheets = workbook.sheet_names()  # 获取工作簿中的所有表格
+        worksheet = workbook.sheet_by_name(sheets[0])  # 获取工作簿中所有表格中的的第一个表格
+        rows_old = worksheet.nrows  # 获取表格中已存在的数据的行数
+        rid = rows_old
+        # 热词
+        word = elem[0]
+        # 词频
+        frequency = elem[1]
+        # 情绪
+        emotion = sentiments_temp[index]
+
+        if frequency >= 3 and float(emotion) >= 0.7:
+            value1 = [
+                [rid, word,frequency,emotion], ]
+            print("当前插入第%d条数据" % rid)
+            save.write_excel_xls_append_norepeat("test4.xls", value1)
+
+file = open('test2.txt',errors="ignore",encoding="utf-8")
+file_context = file.read()
+words1 = jieba.lcut(file_context)  # 全模式
+words2 = jieba.lcut_for_search(file_context)  # 搜索引擎模式
+
+# 统计词频
+data1 = {}
+for chara in words1:
+    if len(chara) < 2:
+        continue
+    if chara in data1:
+        data1[chara] += 1
+    else:
+        data1[chara] = 1
+data1 = sorted(data1.items(), key=lambda x: x[1], reverse=True)  # 排序
+
+data2 = {}
+for chara in words2:
+    if len(chara) < 2:
+        continue
+    if chara in data2:
+        data2[chara] += 1
+    else:
+        data2[chara] = 1
+data2 = sorted(data2.items(), key=lambda x: x[1], reverse=True)  # 排序
+
+print("----------")
+
+# 放存入txt的，词语*频次列表
+txt_context = []
+# 放存入txt的，情感词列表
+sentiments_temp = []
+
+print("---褒义---")
+print('词语：           频次：       情感值：' )
+for i in data2:
+    s1 = SnowNLP(i[0])
+    sentiments_temp.append(str(round(s1.sentiments,2)))
+    try:
+        if i[1] >= 10 and s1.sentiments >= 0.7:
+            # 打印
+            print(i[0] +  '           ' + str(i[1]) + '              ' + str(round(s1.sentiments,2)))
+            # 存txt
+            for j in range(i[1]):
+                txt_context.append(str(i[0]))
+    except ValueError:
+        pass
+
+#存txt
+random.shuffle(txt_context)
+txt_str = ''
+for i in txt_context:
+    txt_str = txt_str + i + "，"
+insertToTxt(txt_str, 'test3.txt')
+print("存txt，ok")
+
+# 存xls
+if os.path.exists("test4.xls"):
+    print("文件已存在")
+else:
+    print("文件不存在，重新创建")
+    value_title = [["rid", "热词", "词频", "情感值"],]
+    save.write_excel_xls("test4.xls", "统计数据", value_title)
+insert_data(data2,sentiments_temp)
+print("存xls，ok")
diff --git a/baiduNLP.py b/baiduNLP.py
@@ -0,0 +1,135 @@
+# coding=utf-8
+
+import sys
+import json
+import base64
+import time
+
+# Python 3 only in practice: the Python 2 import branch below is commented out, so nothing is imported when IS_PY3 is false.
+IS_PY3 = sys.version_info.major == 3
+if IS_PY3:
+    from urllib.request import urlopen
+    from urllib.request import Request
+    from urllib.error import URLError
+    from urllib.parse import urlencode
+    from urllib.parse import quote_plus
+# else:
+#     import urllib2
+#     from urllib import quote_plus
+#     from urllib2 import urlopen
+#     from urllib2 import Request
+#     from urllib2 import URLError
+#     from urllib import urlencode
+
+# NOTE(review): this swaps the default HTTPS context factory for the
+# unverified one, so HTTPS certificate verification is skipped for urllib
+# requests made by this process. Confirm this is acceptable.
+import ssl
+ssl._create_default_https_context = ssl._create_unverified_context
+
+# Credentials sent by fetch_token() as client_id / client_secret
+API_KEY = 'v5vb'
+SECRET_KEY = 'r8yy'
+
+# Endpoint the __main__ block combines with the access token
+COMMENT_TAG_URL = "https://aip.baidubce.com/rpc/2.0/nlp/v2/comment_tag"
+
+"""  TOKEN start """
+# Endpoint fetch_token() posts the credentials to
+TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'
+
+"""
+    get token
+"""
+def fetch_token():
+    params = {'grant_type': 'client_credentials',
+              'client_id': API_KEY,
+              'client_secret': SECRET_KEY}
+    post_data = urlencode(params)
+    if (IS_PY3):
+        post_data = post_data.encode('utf-8')
+    req = Request(TOKEN_URL, post_data)
+    try:
+        f = urlopen(req, timeout=5)
+        result_str = f.read()
+    except URLError as err:
+        print(err)
+    if (IS_PY3):
+        result_str = result_str.decode()
+
+    result = json.loads(result_str)
+
+    if ('access_token' in result.keys() and 'scope' in result.keys()):
+        if not 'brain_all_scope' in result['scope'].split(' '):
+            print ('please ensure has check the  ability')
+            exit()
+        return result['access_token']
+    else:
+        print ('please overwrite the correct API_KEY and SECRET_KEY')
+        exit()
+
+"""
+    call remote http server
+"""
+def make_request(url, comment):
+    print("---------------------------------------------------")
+    print("评论文本：")
+    # print("    " + comment)
+    print("\n评论观点：")
+
+    response = request(url, json.dumps(
+    {
+        "text": comment,
+        # 13为3C手机类型评论，其他类别评论请参考 https://ai.baidu.com/docs#/NLP-Apply-API/09fc895f
+        "type": 11
+    }))
+
+    data = json.loads(response)
+
+    if "error_code" not in data or data["error_code"] == 0:
+        for item in data["items"]:
+            # 积极的评论观点
+            if item["sentiment"] == 2:
+                print(u"    积极的评论观点: " + item["prop"] + item["adj"])
+            # 中性的评论观点
+            if item["sentiment"] == 1:
+                print(u"    中性的评论观点: " + item["prop"] + item["adj"])
+            # 消极的评论观点
+            if item["sentiment"] == 0:
+                print(u"    消极的评论观点: " + item["prop"] + item["adj"])
+    else:
+        # print error response
+        print(response)
+
+    # 防止qps超限
+    time.sleep(0.5)
+
+"""
+    call remote http server
+"""
+def request(url, data):
+    req = Request(url, data.encode('utf-8'))
+    has_error = False
+    try:
+        f = urlopen(req)
+        result_str = f.read()
+        if (IS_PY3):
+            result_str = result_str.decode()
+        return result_str
+    except  URLError as err:
+        print(err)
+
+if __name__ == '__main__':
+
+    # comment1 = "手机已经收到，非常完美超出自己的想象，外观惊艳 黑色高端加外形时尚融为一体比较喜欢的类型。系统流畅优化的很好，操作界面简洁大方好上手。电池用量很满意，快充很不错。相机拍人拍物都美。总而言之一句话很喜欢的宝贝。"
+    # comment2 = "外观精美大小正合适，做工精细，线条流畅，拍照完美，吃鸡最高画质无压力。连续玩了三个小时掉电百分之二十，电池强劲持久，无明显发热，操作流畅，准备再买一台给老婆生日礼物！"
+    # comment3 = "大家千万不要在上当了，耗电特别快，手机激活后不支持7天无理由退货，请大家小心购买"
+
+    file = open('test2.txt', errors="ignore", encoding="utf-8")
+    file_context = file.read()
+    comment1 = file_context
+
+    # get access token
+    token = fetch_token()
+
+    # concat url
+    url = COMMENT_TAG_URL + "?charset=UTF-8&access_token=" + token
+
+    make_request(url, comment1)
+    # make_request(url, comment2)
+    # make_request(url, comment3)
diff --git a/excelSave.py b/excelSave.py
@@ -0,0 +1,55 @@
+import xlrd
+import xlwt
+from xlutils.copy import copy
+
+def write_excel_xls(path, sheet_name, value):
+    index = len(value)  # 获取需要写入数据的行数
+    workbook = xlwt.Workbook()  # 新建一个工作簿
+    sheet = workbook.add_sheet(sheet_name)  # 在工作簿中新建一个表格
+    for i in range(0, index):
+        for j in range(0, len(value[i])):
+            sheet.write(i, j, value[i][j])  # 像表格中写入数据（对应的行和列）
+    workbook.save(path)  # 保存工作簿
+    print("xls格式表格写入数据成功！")
+
+def read_excel_xls(path):
+    data = []
+    workbook = xlrd.open_workbook(path)  # 打开工作簿
+    sheets = workbook.sheet_names()  # 获取工作簿中的所有表格
+    worksheet = workbook.sheet_by_name(sheets[0])  # 获取工作簿中所有表格中的的第一个表格
+    if worksheet.nrows == 1:
+        print("目前是第一行")
+    else:
+        for i in range(1, worksheet.nrows): #从第二行取值
+            dataTemp = []
+            for j in range(0, worksheet.ncols):
+                #print(worksheet.cell_value(i, j), "\t", end="")  # 逐行逐列读取数据
+                dataTemp.append(worksheet.cell_value(i, j))
+            data.append(dataTemp)
+    return data
+
+def write_excel_xls_append_norepeat(path, value):
+    workbook = xlrd.open_workbook(path)  # 打开工作簿
+    sheets = workbook.sheet_names()  # 获取工作簿中的所有表格
+    worksheet = workbook.sheet_by_name(sheets[0])  # 获取工作簿中所有表格中的的第一个表格
+    rows_old = worksheet.nrows  # 获取表格中已存在的数据的行数
+    new_workbook = copy(workbook)  # 将xlrd对象拷贝转化为xlwt对象
+    new_worksheet = new_workbook.get_sheet(0)  # 获取转化后工作簿中的第一个表格
+    rid = 0
+    for i in range(0, len(value)):
+        data = read_excel_xls(path)
+        data_temp = []
+        for m in range(0,len(data)):
+            data_temp.append(data[m][1:len(data[m])])
+        value_temp = []
+        for m in range(0,len(value)):
+            value_temp.append(value[m][1:len(value[m])])
+
+        if value_temp[i] not in data_temp:
+            for j in range(0, len(value[i])):
+                new_worksheet.write(rid+rows_old, j, value[i][j])  # 追加写入数据，注意是从i+rows_old行开始写入
+            rid = rid + 1
+            new_workbook.save(path)  # 保存工作簿
+            print("xls格式表格【追加】写入数据成功！")
+        else:
+            print("数据重复")