"""Export part-of-speech sequences of Stack Overflow posts to a CSV file.

Posts are read from a MySQL database, stripped of HTML markup and code
snippets, tagged with spaCy, and written to ``posts.csv``.

The script targets Python 2 (it relies on ``sys.setdefaultencoding``).
"""
from __future__ import unicode_literals

import csv
import itertools
import os
import re
import shutil
import sys
from collections import namedtuple
from pathlib import Path

import pymysql
import spacy
from pycorenlp import StanfordCoreNLP

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')

from log import *

# DB info: mysql
# table aposts containing posts from stack overflow: id, title, body, tags
# table reason2post containing the postid, question category ("reason"),
# and phrase ("how")

# The word/sentence helpers use a different schema than the one the posts are
# read from.  NOTE(review): confirm that this split is still intentional.
_WORD_SENTENCE_DB = 'SO_2016_Dez'

# NOTE(review): the HTML literals of the original clean-up code were lost when
# this file was extracted (every tag inside a string literal was stripped).
# The patterns below are a reconstruction: remove every tag except <code>,
# which __removeCode still needs, and decode the three entities that were
# still recognisable.  Confirm against the original source.
_TAG_RE = re.compile(r"</?(?!code\b)[A-Za-z][^<>]*>")
_CODE_RE = re.compile(r"<code\b[^<>]*>(.*?)</code>")
_ENTITIES = (("&nbsp;", " "), ("&gt;", ">"), ("&lt;", "<"))


class postprocessor(object):
    """Reads posts from MySQL, cleans them and exports POS tags as CSV."""

    def __init__(self):
        """Set the connection settings, output file name and spaCy model."""
        self.host = 'localhost'
        # Was 3006; every actual connection in this file used 3306.
        self.port = 3306
        self.user = 'root'
        self.passwd = ''
        self.db = 'SO_2017_Sept'
        self.StackOverflowPost = namedtuple('post', 'id, title, body, tags, reason, how')
        self.save_to_file = True
        self.renew_db = False
        self.filename = "posts.csv"
        self.nlp = spacy.load('en')

    def main(self, _min=0, _max=500):
        """Write one CSV row (cleaned text plus POS tags) per post.

        Args:
            _min: offset of the first post to export.
            _max: exclusive upper bound of the exported range.
        """
        posts = self.__get_posts(_min, _max)
        header = ["id", "reason", "title", "body", "how",
                  "title_pos", "body_pos", "how_pos"]
        with open(self.filename, "w+") as csv_file:
            # str(",") is a byte string on Python 2 (required by csv there)
            # and a text string on Python 3.
            csv_writer = csv.writer(csv_file, delimiter=str(","))
            csv_writer.writerow(header)
            for progress, post in enumerate(posts, 1):
                dlog(post.id)
                log(post.id)
                _title, _body = self.__preprocess_post(post)
                # A NULL phrase is exported as an empty string.
                _how = post.how or ""
                csv_writer.writerow([
                    post.id,
                    post.reason,
                    _title,
                    _body,
                    _how,
                    self.__run_spacy_NLP(_title),
                    self.__run_spacy_NLP(_body),
                    self.__run_spacy_NLP(_how),
                ])
                if progress % 10 == 0:
                    log("---> " + str(progress) + " posts processed")

    def __connect(self, db=None):
        """Open a MySQL connection to ``db`` (default: ``self.db``)."""
        return pymysql.connect(host=self.host, port=self.port, user=self.user,
                               passwd=self.passwd, db=db or self.db)

    def __save_to_txt(self, text, post_id, where, reason):
        """Write ``text`` to ``<where>/txt__<post_id>__<reason>.txt``."""
        filename = where + "/txt__" + str(post_id) + "__" + reason + ".txt"
        with open(filename, "w+") as txt_file:
            txt_file.write(text + "\r\n")

    def __preprocess_post(self, post):
        """Return the cleaned ``(title, body)`` of ``post`` without slashes."""
        title = self.__removeCodeAndHTML(post.title).replace("/", "")
        body = self.__removeCodeAndHTML(post.body).replace("/", "")
        return title, body

    def __run_spacy_NLP(self, text):
        """Return the POS tag of every token in ``text``.

        Each tag is followed by a single space, so a non-empty result ends
        with a trailing space.
        """
        doc = self.nlp(text)
        return "".join(word.pos_ + " " for sen in doc.sents for word in sen)

    def __get_posts(self, _min=0, _max=500):
        """Fetch the posts in the half-open range ``[_min, _max)``.

        MySQL's ``LIMIT a, b`` means "skip a rows, return b rows"; the
        original passed ``_max`` as the row count while logging
        ``_max - _min``, which only agreed for ``_min == 0``.
        """
        offset = int(_min)
        row_count = max(int(_max) - offset, 0)
        log("query for " + str(row_count) + " posts")
        query = ("SELECT aposts.id, title, body, tags, r.reason, r.how "
                 "FROM aposts join reason2post r on r.postId = aposts.Id "
                 "where what != 'val' limit %s, %s")
        return self.__get_stackoverflowposts_from_db(query, (offset, row_count))

    def __get_stackoverflowposts_from_db(self, query, params=None):
        """Run ``query`` and return the rows as ``StackOverflowPost`` tuples.

        Args:
            query: SQL text, optionally containing ``%s`` placeholders.
            params: values for the placeholders, or ``None``.
        """
        conn = self.__connect()
        try:
            cur = conn.cursor()
            try:
                dlog(query)
                cur.execute(query, params)
                return [self.StackOverflowPost._make(row)
                        for row in cur.fetchall()]
            finally:
                cur.close()
        finally:
            conn.close()

    def __get_post_by_id(self, id):
        """Fetch the rows of the post with the given id."""
        query = ("SELECT aposts.id, title, body, tags, r.reason, r.how "
                 "FROM aposts join reason2post r on r.postId = aposts.Id "
                 "where aposts.id = %s")
        return self.__get_stackoverflowposts_from_db(query, (id,))

    def __removeCodeAndHTML(self, txt):
        """Strip HTML markup first, then code snippets, logging both stages."""
        txt = self.__removeHTML(txt)
        dlog(txt)
        dlog("---------------")
        txt = self.__removeCode(txt)
        dlog(txt)
        return txt

    def __removeCode(self, text):
        """Unwrap single-token code spans and drop all other code spans.

        A span such as ``<code>ArrayList</code>`` is kept as the bare word;
        a span containing several whitespace-separated tokens (or none) is
        removed.  Every span is judged on its own, and a replacement
        function is used so the snippet is never interpreted as a regular
        expression or replacement template.
        """
        def unwrap(match):
            snippet = match.group(1)
            return snippet if len(snippet.split()) == 1 else ""

        text = _CODE_RE.sub(unwrap, text)
        dlog(text)
        return text

    def __removeHTML(self, text):
        """Return ``text`` without line breaks, HTML tags and parentheses.

        ``<code>`` tags are left in place for ``__removeCode``.  Byte
        strings are decoded as UTF-8 (invalid bytes are replaced); text
        that is already unicode is accepted unchanged.
        """
        if isinstance(text, bytes):
            text = text.decode("utf-8", "replace")
        text = text.replace("\n", "")
        # Strip tags before decoding entities so that an escaped "&lt;b&gt;"
        # in prose is not mistaken for markup.
        text = _TAG_RE.sub("", text)
        for entity, char in _ENTITIES:
            text = text.replace(entity, char)
        text = text.replace("(", "").replace(")", "")
        return text

    def __writeToCsv(self, id, reason, __window_size, text, question=False):
        """Print the first ``__window_size`` words of ``text``.

        TODO: implement the sliding window and the CSV export here
        (columns: post id, reason, all words of the window).
        """
        for word in itertools.islice(text.split(), 0, __window_size):
            print(word + " - ")

    def __save_into_db(self, query, items):
        """Execute ``query`` once per item, committing after each row.

        Returns:
            The row id of the last insert, or ``None`` if ``items`` is empty.
        """
        last_id = None
        conn = self.__connect(_WORD_SENTENCE_DB)
        try:
            cur = conn.cursor()
            try:
                for item in items:
                    cur.execute(query, item)
                    conn.commit()
                    last_id = cur.lastrowid
            finally:
                cur.close()
        finally:
            conn.close()
        return last_id

    def __save_words_into_db(self, words, sen_id):
        """Insert ``words`` (post_id, word, type, position) for a sentence."""
        query = ("INSERT INTO word (sentence_id, post_id, word, type, position) "
                 "VALUES (%s, %s, %s, %s, %s);")
        # Pass the sentence id as a bound value instead of splicing it into
        # the SQL text.
        rows = [(sen_id,) + tuple(word) for word in words]
        return self.__save_into_db(query, rows)

    def __save_sentences_into_db(self, words):
        """Insert sentence rows (post_id, question, _where, position)."""
        query = "INSERT INTO sentence (post_id, question, _where, position) VALUES (%s, %s, %s, %s);"
        return self.__save_into_db(query, words)

    def __delete_words_and_sentences(self):
        """Empty the ``word`` and ``sentence`` tables."""
        self.__delete_from_db("word")
        self.__delete_from_db("sentence")

    def __delete_from_db(self, table):
        """Delete every row of ``table`` and reset its AUTO_INCREMENT value.

        ``table`` is spliced into the SQL text because identifiers cannot be
        bound as parameters; only pass trusted, hard-coded table names.
        """
        conn = self.__connect(_WORD_SENTENCE_DB)
        try:
            cur = conn.cursor()
            try:
                query = "DELETE FROM " + table + " ;"
                dlog(query)
                log(query)
                cur.execute(query)
                conn.commit()
                cur.execute("ALTER TABLE " + table + " AUTO_INCREMENT = 0")
                conn.commit()
            finally:
                cur.close()
        finally:
            conn.close()

    def __delete_all_contents_of_folder(self, directory):
        """Leave ``directory`` existing and empty.

        The original removed an existing directory without recreating it,
        so the folder was gone afterwards instead of merely emptied.
        """
        if os.path.isdir(directory):
            shutil.rmtree(directory)
        os.makedirs(directory)


if __name__ == "__main__":
    postprocessor().main(0, 3000)
    print("FINISHED")