# entigen
# Aleksandr Sergeev
Final project for my computer science major at Earlham College.
Required Python 3 packages are listed in requirements.txt.
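Install them with '$ pip install -r requirements.txt'.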
----------
HOW TO USE
----------
All resources with sentences to classify must go in the resources/ directory and have a .txt or .pdf extension.
All predictions will currently save to the resources/predictions/ directory.
A pre-trained model ships with the code, trained for 20 epochs with an embedding layer. GloVe embeddings can be downloaded for free from https://nlp.stanford.edu/projects/glove/; by default the code looks for them at code/data/glove.840B.300d.txt (see --embedding_path in parse_args.py).
All arguments are defined in the parse_args.py file and can be changed there.
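Arguments can also be overridden on the command line, e.g. '$ python driver.py --num_epochs 20 --embedding'.
--- Loading the Pre-trained Model ---
To use the bundled model outside the driver, a minimal sketch (assuming the default --model_path and --model_weights_path locations from parse_args.py) is:
    from keras.models import model_from_json
    with open("models/model.json") as fh:
        model = model_from_json(fh.read())
    model.load_weights("models/model.h5")
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])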
--- Using the Driver File ---
There is a file, driver.py, which lets you train, validate, and classify sentences from the terminal. To run it, change your terminal's working directory to the code subdirectory and run '$ python driver.py'.
--- Using the GUI ---
If you want to use the interactive GUI for predictions, you simply have to run '$ python gui_main.py' in the code subdirectory.
--- PyInstaller ---
PyInstaller is a library that you can use to bundle the whole GUI into a single executable. You will need to install PyInstaller first.
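Install it with '$ pip install pyinstaller'; as a starting point (flags may need adjusting for your platform), run '$ pyinstaller --onefile gui_main.py' from the code subdirectory.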
from parse_args import ARGS
from text_process import *
import os
import train
import validate
actions = ["predict", "train model", "validate model", "exit session"]
def driver():
while True:
print("\nActions:")
for i, action in enumerate(actions):
print(str(i) + " - " + action)
        idx = int(input("Enter corresponding number to action: "))
if idx == 0:
predict()
print("Predictions completed. Thank you!")
return
elif idx == 1:
train_model()
print("New model trained. Thank you!")
return
elif idx == 2:
validate_model()
print("Model validated. Thank you!")
return
elif idx == 3:
print("Thank you!")
return
else:
print("Incorrect input, please try again")
def train_model():
train.train()
def validate_model():
validate.validate()
def predict():
entries = os.listdir(ARGS.resources_path)
entries.append("exit")
while True:
print("\nFile selection:")
for i, file in enumerate(entries):
print(str(i) + " - " + file)
        try:
            idx = int(input("Enter corresponding number to file selection: "))
        except ValueError:
            print("Please enter only an integer")
            continue
        if idx == len(entries) - 1:
            print("Exiting predict")
            return
        try:
            file_name = entries[idx]
            print("Selection: " + entries[idx])
            if ".pdf" in file_name:
                data = TextProcess(ARGS.resources_path + file_name, PDF=True)
            elif ".txt" in file_name:
                data = TextProcess(ARGS.resources_path + file_name, PDF=False)
            else:
                print("Unknown file format, please use only .txt or .pdf files")
                continue
        except IndexError:
            print("Incorrect input, please try again")
            continue
ent_predictions = {}
data.ie_preprocess()
print("pdf pre-processing completed. \n")
ent_request = ""
while True:
ent_request = input("Enter entity ('all' to process all valid sentences), or 'exit' to return: ")
if ent_request == "exit":
break
if ent_request not in ent_predictions:
pred_to_sentences = {}
ent_sentences = data.ent_preprocess(ent_request)
if len(ent_sentences) == 0:
print("Found no sentences with request %s" % (ent_request))
continue
predictions = validate.predict_classes(ent_sentences)
total = 0
                for pred, sent in zip(predictions, ent_sentences):
sent = TextProcess.restore_sentence(sent)
if pred not in pred_to_sentences:
pred_to_sentences[pred] = [sent]
total += 1
elif sent not in pred_to_sentences[pred]:
pred_to_sentences[pred].append(sent)
total += 1
print("%d sentences with request %s valid and pre-processed" % (total, ent_request))
ent_predictions[ent_request] = pred_to_sentences
            view_pred = input("Do you wish to view the predictions? yes/no: ")
            if view_pred == "yes":
                display_predictions(ent_request, ent_predictions[ent_request])
to_save = input("Do you wish to save all predictions to file? yes/no: ")
if to_save == "yes":
            with open(ARGS.resources_path + "predictions/predictions_" + file_name, "w") as fh:
for ent in ent_predictions:
fh.write("\n\n ENTITY - " + ent + "\n")
for pred in ent_predictions[ent]:
fh.write(validate.prediction_values[pred] + ":\n")
for sent in ent_predictions[ent][pred]:
fh.write("\t" + sent + "\n")
print("Predictions saved to " + ARGS.predictions_save_path + "predictions_" + file_name)
def display_predictions(ent, pred_to_sentences):
print("Displaying predictions for entity %s" % (ent))
while True:
for key, value in validate.prediction_values.items():
print(key, value)
try:
idx = int(input("Enter number corresponding with class you wish to view: "))
except ValueError:
print("Please enter only an integer")
continue
if not 0 <= idx <= 10:
print("Incorrect input, please try again")
continue
elif idx == 10:
break
else:
try:
sentences = pred_to_sentences[idx]
except KeyError:
print("No sentences under this class.")
continue
count_sent = len(sentences)
display = 0
print("\n\nTotal sentences for this class is %d" % (count_sent))
print("Displaying sentences in batches of 10 \n")
while display < count_sent:
print(sentences[display])
display += 1
if display % 10 == 0:
cont = input("\nDo you wish to display another batch? yes/no: ")
if cont == "no":
break
print("\n")
print("\n\n")
if __name__=='__main__':
driver()
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from parse_args import ARGS
def create_model(num_words, embedding_size, max_len, label_len, word_index, embedding):
print("Creating model")
model = Sequential()
if embedding:
print("Creating word embedding layer...\n")
embeddings_index, dim = create_word_embedding(ARGS.embedding_path)
embedding_matrix = np.zeros((len(word_index) + 1, dim))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
embedding_layer = Embedding(len(embedding_matrix), dim, weights=[embedding_matrix], input_length=max_len, trainable=False)
model.add(embedding_layer)
else:
model.add(Embedding(num_words, embedding_size, input_length=max_len))
model.add(Bidirectional(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(32, activation='relu'))
model.add(Dense(label_len, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())
return model
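# Example call (hypothetical tokenizer variable; label_len=10 matches the
# bundled models/model.json, the other values are the parse_args defaults):
# model = create_model(num_words=20000, embedding_size=300, max_len=100,
#                      label_len=10, word_index=tokenizer.word_index,
#                      embedding=ARGS.embedding)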
def create_word_embedding(embedding_path):
embeddings_index = dict()
print("Loading embedding file...\n")
with open(embedding_path, 'r') as fh:
lines = fh.readlines()
for line in lines:
values = line.split()
try:
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
        except ValueError:
            # GloVe 840B contains multi-token "words" (e.g. ". . ."); scan
            # forward until the first token that parses as a float, then treat
            # everything before it as the word.
            word = values[0]
            idx = 1
            for token in values[1:]:
                try:
                    float(token)
                    break
                except ValueError:
                    word += " " + token
                    idx += 1
            coefs = np.asarray(values[idx:], dtype='float32')
            embeddings_index[word] = coefs
dim = len(lines[-1].split()) - 1
print("Loaded embedding file of dimension size " + str(dim) + "\n")
return embeddings_index, dim
from parse_args import ARGS
from text_process import *
import validate
import tkinter as tk
import os
font_text = ("Helvetica", 13)
font_label = ("Helvetica", 15)
font_label2 = ("Helvetica", 14)
font_button = ("Helvetica", 13, "italic")
class Entigen(tk.Tk):
def __init__(self, *args, **kwargs):
tk.Tk.__init__(self, *args, **kwargs)
self.title("entigen")
self.geometry('800x600')
container = tk.Frame(self)
container.pack(side="top", fill="both", expand=True)
container.grid_rowconfigure(0, weight=1)
container.grid_columnconfigure(0, weight=1)
self.frames = {}
for F in (ActionPage, ProcessTextPage, PredictionPage):
page_name = F.__name__
frame = F(parent=container, controller=self)
self.frames[page_name] = frame
frame.grid(row=0, column=0, sticky="nsew")
# START FRAME
self.show_frame("ProcessTextPage")
def show_frame(self, page_name):
frame = self.frames[page_name]
frame.tkraise()
class ActionPage(tk.Frame):
def __init__(self, parent, controller):
tk.Frame.__init__(self, parent)
self.controller = controller
actions_label = tk.Label(self, text="Available Actions")
actions_label.pack()
predict_button = tk.Button(self, height=1, width=12, text="Predict",
command=lambda: controller.show_frame("ProcessTextPage"))
predict_button.pack()
train_button = tk.Button(self, height=1, width=12, text="Train model",
command=lambda: controller.show_frame("TrainPage"))
train_button.pack()
validate_button = tk.Button(self, height=1, width=12, text="Validate model",
command=lambda: controller.show_frame("ValidatePage"))
validate_button.pack()
class ProcessTextPage(tk.Frame):
def __init__(self, parent, controller):
tk.Frame.__init__(self, parent)
self.controller = controller
files = os.listdir(ARGS.resources_path)
select_label = tk.Label(self, text="Select .pdf/.txt file to process",
font=font_label).pack()
self.listbox = tk.Listbox(self, font=font_text,
width=40, height=20)
self.listbox.pack()
for file in files:
if ".pdf" in file or ".txt" in file:
self.listbox.insert(tk.END, file)
process_button = tk.Button(self, text="Process selected file", font=font_button,
command=lambda: self.preprocess_file(self.listbox.get(tk.ACTIVE)))
process_button.pack()
def preprocess_file(self, file_name):
if ".pdf" in file_name:
data = TextProcess(ARGS.resources_path + file_name, PDF=True)
elif ".txt" in file_name:
data = TextProcess(ARGS.resources_path + file_name, PDF=False)
else:
return
data.ie_preprocess()
prediction_frame = self.controller.frames["PredictionPage"]
prediction_frame.data = data
prediction_frame.processing_file.set("Processing file " + file_name)
self.controller.show_frame("PredictionPage")
class PredictionPage(tk.Frame):
def __init__(self, parent, controller):
tk.Frame.__init__(self, parent)
self.controller = controller
self.processing_file = tk.StringVar()
self.data = None
self.ent_predictions = {}
file_name = tk.Label(self, textvariable=self.processing_file,
font=font_label).pack()
entity = tk.Label(self, text="Entity Name", font=font_label2).pack()
entity_input = tk.Entry(self, font=font_text)
entity_input.pack()
process_entity = tk.Button(self, text="Process entity", font=font_button,
command=lambda: self.get_entity_sentences(entity_input))
process_entity.pack()
self.listbox = tk.Listbox(self, font=font_text,
width=40, height=20)
self.listbox.pack()
delete_entry = tk.Button(self, text="Delete Entity", font=font_button,
command=lambda: self.delete_entity())
delete_entry.pack()
show_entry = tk.Button(self, text="Show Entity Predictions", font=font_button,
command=lambda: self.show_predictions())
show_entry.pack()
save_all = tk.Button(self, text="Save Predictions to File", font=font_button,
command=lambda: self.save_all())
save_all.pack()
back = tk.Button(self, text="Back", font=font_button,
command=lambda: self.back())
back.pack()
    def delete_entity(self):
        lb = self.listbox
        selection = lb.get(tk.ACTIVE)
        if not selection:
            return
        entity = selection.split(" - ")[0]
        del self.ent_predictions[entity]
        lb.delete(tk.ACTIVE)
    def save_all(self):
        # Keep the full file name even if it contains spaces.
        file_name = self.processing_file.get().split(" ", 2)[2]
with open(ARGS.resources_path + "predictions/pred_" + file_name, "w") as fh:
for ent in self.ent_predictions:
fh.write("\n\nENTITY - " + ent + "\n")
for pred in self.ent_predictions[ent]:
fh.write(validate.prediction_values[pred] + ":\n")
for sent in self.ent_predictions[ent][pred]:
fh.write("\t" + sent + "\n")
def back(self):
self.ent_predictions = {}
self.listbox.delete(0, tk.END)
self.controller.show_frame("ProcessTextPage")
    def show_predictions(self):
        lb = self.listbox
        entity = lb.get(tk.ACTIVE).split(" - ")[0]
        if entity not in self.ent_predictions:
            return
        predictions = self.ent_predictions[entity]
display_string = "ENTITY: " + entity
for pred in predictions:
display_string += "\n\n" + validate.prediction_values[pred] + ":\n"
for sent in predictions[pred]:
display_string += " - " + sent + "\n"
top = tk.Toplevel(self)
s = tk.Scrollbar(top)
text= tk.Text(top, font=font_text)
text.pack(side=tk.LEFT, fill="both", expand=True)
s.pack(side=tk.RIGHT, fill=tk.Y)
s.config(command=text.yview)
text.config(yscrollcommand=s.set)
text.insert(tk.END, display_string)
def get_entity_sentences(self, entity_input):
ent_request = entity_input.get()
if ent_request not in self.ent_predictions:
pred_to_sentences = {}
ent_sentences = self.data.ent_preprocess(ent_request)
count = len(ent_sentences)
self.listbox.insert(tk.END, ent_request + " - " + str(count))
if count == 0:
self.ent_predictions[ent_request] = pred_to_sentences
return
predictions = validate.predict_classes(ent_sentences)
            for pred, sent in zip(predictions, ent_sentences):
sent = TextProcess.restore_sentence(sent)
if pred not in pred_to_sentences:
pred_to_sentences[pred] = [sent]
elif sent not in pred_to_sentences[pred]:
pred_to_sentences[pred].append(sent)
self.ent_predictions[ent_request] = pred_to_sentences
if __name__ == "__main__":
gui = Entigen()
gui.mainloop()
{"class_name": "Sequential", "config": [{"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": false, "batch_input_shape": [null, 100], "dtype": "float32", "input_dim": 22910, "output_dim": 300, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 100}}, {"class_name": "Bidirectional", "config": {"name": "bidirectional_1", "trainable": true, "layer": {"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 128, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.2, "recurrent_dropout": 0.2, "implementation": 1}}, "merge_mode": "concat"}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 32, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 10, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.1.5", "backend": "tensorflow"}
from argparse import ArgumentParser
def parse_args():
parser = ArgumentParser()
# FILES
parser.add_argument("--model_path", default="./models/model.json",
type=str, help="Model file path")
parser.add_argument("--model_weights_path", default="./models/model.h5",
type=str, help="Model weights file path")
parser.add_argument("--train_path", default="./data/TRAIN_FILE.TXT",
type=str, help="Train data file path")
parser.add_argument("--test_path", default="./data/TEST_FILE_FULL.TXT",
type=str, help="Test data file path")
parser.add_argument("--data_path", default="./data/temp_sentences.txt",
type=str, help="Processed data file path")
parser.add_argument("--resources_path", default="../resources/",
type=str, help="Resources file path")
parser.add_argument("--embedding_path", default="./data/glove.840B.300d.txt",
type=str, help="Word embedding file path")
parser.add_argument("--predictions_save_path", default="./predictions/",
type=str, help="Prediction files save path")
# VALUES
parser.add_argument("--embedding", default=False,
type=bool, help="To use word embedding or not")
parser.add_argument("--batch_size", default=40,
type=int, help="Training batch size")
parser.add_argument("--max_len", default=100,
type=int, help="Max sentence length in data")
parser.add_argument("--num_words", default=20000,
type=int, help="Max number of words to track")
parser.add_argument("--num_epochs", default=10,
type=int, help="Number of epochs")
parser.add_argument("--embedding_size", default=300,
type=int, help="Embedding layer size")
args = parser.parse_args()
return args
ARGS = parse_args()
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
def get_sentences_labels(input_path):
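    # Assumes a SemEval-2010 Task 8 style file: four lines per record
    # (numbered sentence in quotes, relation label, comment, blank line).
    # Entity tags <e1>/<e2> are rewritten as E1_START/E1_END and
    # E2_START/E2_END tokens, and the relation direction, e.g. "(e1,e2)",
    # is stripped from the label.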
sentences = []
labels = []
with open(input_path, 'r') as fh:
lines = fh.readlines()
for i in range(0, len(lines), 4):
line = lines[i]
start_sent = line.find('"')
sentence = line[start_sent+1:len(line) - 2]
sentence = sentence.replace("<e1>", "E1_START ").replace("</e1>", " E1_END")
sentence = sentence.replace("<e2>", "E2_START ").replace("</e2>", " E2_END")
label = lines[i+1].rstrip().split("(")[0]
sentences.append(sentence)
labels.append(label)
return sentences, labels
def create_training_data(train_data_path, test_data_path, num_words, max_len, test_size):
sentences, labels = get_sentences_labels(train_data_path)
sentences_2, labels_2 = get_sentences_labels(test_data_path)
sentences.extend(sentences_2)
labels.extend(labels_2)
t = Tokenizer(num_words=num_words)
t.fit_on_texts(sentences)
sequences = t.texts_to_sequences(sentences)
train_data = pad_sequences(sequences, maxlen=max_len)
lb = LabelBinarizer()
labels = np.array(labels)
train_labels = lb.fit_transform(labels)
sent_train, sent_test, label_train, label_test = train_test_split(train_data , train_labels, test_size=test_size, random_state=42)
return sent_train, sent_test, label_train, label_test, t
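# Example call (ARGS and test_size=0.2 are hypothetical here; ARGS would come
# from parse_args):
# sent_train, sent_test, label_train, label_test, tokenizer = create_training_data(
#     ARGS.train_path, ARGS.test_path, ARGS.num_words, ARGS.max_len, test_size=0.2)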
def create_model_data(sentences, num_words, max_len):
    # NOTE: this fits a fresh Tokenizer on the prediction sentences, so the
    # word indices here are independent of the tokenizer fit on the training
    # data in create_training_data().
    t = Tokenizer(num_words=num_words)
    t.fit_on_texts(sentences)
    sequences = t.texts_to_sequences(sentences)
    data = pad_sequences(sequences, maxlen=max_len)
    return data
import io
import itertools
import spacy
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
class TextProcess():
def __init__(self, text_path, PDF=False):
self.text_path = text_path
print("Extracting text...")
if PDF:
self.extract_text_from_pdf()
else:
self.extract_text_from_doc()
print("Text sucessfully extracted")
def extract_text_from_pdf(self):
resource_manager = PDFResourceManager()
text_stream = io.StringIO()
converter = TextConverter(resource_manager, text_stream)
page_interpreter = PDFPageInterpreter(resource_manager, converter)
with open(self.text_path, 'rb') as fh:
for page in PDFPage.get_pages(fh,
caching=True,
check_extractable=True):
page_interpreter.process_page(page)
text = text_stream.getvalue()