"""Keywords review page: shows TF-IDF keywords extracted from a lecture
transcription and lets the user edit them before note generation."""

import os
import re
from datetime import datetime

import tkinter as tk

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from .llm_page import LlmPage
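# NOTE: word_tokenize and WordNetLemmatizer depend on NLTK data packages
# ("punkt"/"punkt_tab" for tokenization, "wordnet" for lemmatization). If the
# environment does not already provide them, download them once:
#   import nltk
#   nltk.download("punkt")
#   nltk.download("wordnet")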


class KeywordsPage(tk.Frame):
    """Page for reviewing and editing the extracted keywords before note generation."""

    def __init__(self, parent, app_data):
        super().__init__(parent)
        self.app = parent
        self.app_data = app_data
        self.grid_columnconfigure(0, weight=1)
        # Give extra vertical space to the text area row (row 3), which is the
        # widget that should grow with the window.
        self.grid_rowconfigure(3, weight=1)
        self.keywords_description = (
            "Here are the extracted keywords. You can modify them to your liking "
            "before feeding them into note generation. Keywords will be added to "
            "the top of the transcription file."
        )

        tk.Label(self, text="Keywords", font=self.app_data.heading_font).grid(
            row=0, column=0, columnspan=2)

        self.lecture_filename_label = tk.Label(self, font=self.app_data.mono_font)
        self.lecture_filename_label.grid(row=1, column=0, pady=4)

        tk.Label(self, text=self.keywords_description, font=self.app_data.paragraph_font,
                 wraplength=400, justify="left").grid(row=2, column=0, columnspan=2, pady=5, padx=5)

        # Editable keyword area with an attached vertical scrollbar.
        self.keywords_textarea = tk.Text(self, wrap="word", font=self.app_data.paragraph_font)
        self.keywords_textarea.grid(row=3, column=0, sticky="nsew", pady=6, padx=6)
        keywords_scrollbar = tk.Scrollbar(self, command=self.keywords_textarea.yview)
        keywords_scrollbar.grid(row=3, column=1, sticky="ns")
        self.keywords_textarea.config(yscrollcommand=keywords_scrollbar.set)

        tk.Button(
            self, text="Generate Notes", command=self.write_kw_and_forward_to_llm_page,
        ).grid(row=4, column=0, columnspan=2, pady=5)

    def write_kw_and_forward_to_llm_page(self):
        """Prepend the (possibly edited) keywords to the transcript file and
        move on to the note-generation page."""
        self.modified_keywords = self.keywords_textarea.get("1.0", tk.END)
        self.app_data.modified_keywords = self.modified_keywords
        keywords = f"Transcription keywords:\n\n{self.modified_keywords}\n"
        filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
        # Read the existing transcription, then rewrite the file with the
        # keywords block on top.
        with open(filename, "r") as file:
            transcription = file.read()
        with open(filename, "w") as file:
            file.write(keywords)
            file.write(transcription)
        self.app.show_frame(LlmPage)

    def start_kw_extraction_process(self, transcription_text):
        """Save the transcription (with its highlights) to disk and populate
        the keyword editor."""
        # Fall back to a timestamped lecture name if none has been set yet.
        if self.app_data.lecture_filename is None:
            date_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.app_data.lecture_filename = f"lecture_{date_time_str}"
        filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
        self.lecture_filename_label.config(text=filename)

        # Collect the **highlighted** passages and store them ahead of the
        # full transcription.
        extracted_texts = extract_text_in_asterisks(transcription_text)
        highlighted_transcription = extracted_texts + "\n\n" + transcription_text
        self.app_data.highlights = extracted_texts
        os.makedirs("recordings", exist_ok=True)  # guard: directory may not exist yet
        with open(filename, "w") as file:
            file.write(highlighted_transcription)

        # Extract the keywords and load them into the editable text area.
        keywords = self.extract_topics(transcription_text)
        self.keywords_textarea.delete("1.0", tk.END)
        self.keywords_textarea.insert(tk.END, "\n".join(keywords))

    def extract_topics(self, transcript):
        """Return a sorted list of TF-IDF keyword candidates for the transcript."""
        # Lemmatize words into their simplest form, sentence by sentence.
        lemmatizer = WordNetLemmatizer()
        sentences = re.split(r"[.!?]", transcript)
        cleaned_data = []
        for sentence in sentences:
            # Preprocess: lowercase and strip punctuation.
            sentence = sentence.lower()
            sentence = re.sub(r"[^\w\s]", "", sentence)
            # Tokenize the preprocessed sentence.
            words = word_tokenize(sentence)
            # Lemmatize each word (as a verb) and join back into a sentence.
            lemmatized_sentence = " ".join(lemmatizer.lemmatize(word, pos="v") for word in words)
            cleaned_data.append(lemmatized_sentence)

        # TF-IDF settings: keep at most 20 terms that occur in at least 2
        # sentences but in no more than 30% of them, ignoring English stop words.
        n_features = 20
        tfidf_vectorizer = TfidfVectorizer(
            max_df=0.30,
            min_df=2,
            max_features=n_features,
            stop_words="english",
        )
        tfidf_vectorizer.fit(cleaned_data)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
        return sorted(tfidf_feature_names)


def extract_text_in_asterisks(text):
    """Collect every **...** span in the text, one highlight per line."""
    pattern = r"\*\*(.*?)\*\*"
    matches = re.findall(pattern, text)
    return ",\n".join(matches)
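

if __name__ == "__main__":
    # Minimal smoke test of the pure-text helper. Because of the relative
    # import at the top, run this as a module from the package root, e.g.
    # `python -m <package>.keywords_page` (the package name is project-specific).
    sample = "Today we cover **gradient descent** and, briefly, **backpropagation**."
    print(extract_text_in_asterisks(sample))  # -> "gradient descent,\nbackpropagation"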