import re
from datetime import datetime

import tkinter as tk
from tkinter import ttk

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from .llm_page import LlmPage


class KeywordsPage(tk.Frame):
    """Tkinter page that shows TF-IDF keywords extracted from a lecture
    transcription and lets the user edit them before note generation."""

    def __init__(self, parent, app_data):
        super().__init__(parent)
        self.app = parent
        self.app_data = app_data
        self.grid_columnconfigure(0, weight=1)
        self.grid_rowconfigure(4, weight=1)  # the text area row absorbs resizes

        self.keywords_description = (
            'Here are the extracted keywords. You can modify them to your liking '
            'before feeding them into note generation. Keywords will be added to '
            'the top of the transcription file.'
        )

        tk.Label(self, text="Keywords",
                 font=self.app_data.heading_font).grid(row=0, column=0, columnspan=2)

        # Filled in by start_kw_extraction_process once a filename is known.
        self.lecture_filename_label = tk.Label(self, font=self.app_data.mono_font)
        self.lecture_filename_label.grid(row=1, column=0, pady=4)

        tk.Label(self, text=self.keywords_description,
                 font=self.app_data.paragraph_font, wraplength=400,
                 justify="left").grid(row=2, column=0, columnspan=2, pady=5, padx=5)

        # Editable keyword list with an attached vertical scrollbar.
        self.keywords_textarea = tk.Text(self, wrap="word",
                                         font=self.app_data.paragraph_font)
        self.keywords_textarea.grid(row=3, column=0, sticky="nsew", pady=6, padx=6)
        keywords_scrollbar = tk.Scrollbar(self, command=self.keywords_textarea.yview)
        keywords_scrollbar.grid(row=3, column=1, sticky="ns")
        self.keywords_textarea.config(yscrollcommand=keywords_scrollbar.set)

        tk.Button(self, text="Generate Notes",
                  command=self.write_kw_and_forward_to_llm_page).grid(
            row=4, column=0, columnspan=2, pady=5)

    def write_kw_and_forward_to_llm_page(self):
        """Prepend the (possibly user-edited) keywords to the transcript file
        and switch to the LLM notes page."""
        self.modified_keywords = self.keywords_textarea.get('1.0', tk.END)
        self.app_data.modified_keywords = self.modified_keywords

        keywords = f"Transcription keywords:\n\n{self.modified_keywords}\n"
        filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"

        # Read the existing transcript, then rewrite the file with the
        # keywords block on top.
        with open(filename, 'r') as file:
            transcription = file.read()
        with open(filename, 'w') as file:
            file.write(keywords)
            file.write(transcription)

        self.app.show_frame(LlmPage)

    def start_kw_extraction_process(self, transcription_text):
        """Persist the highlighted transcript to disk and populate the
        keyword text area from *transcription_text*."""
        # Fall back to a timestamp-based lecture name when none is set.
        # (Was `== None` with the filename/label code duplicated per branch.)
        if self.app_data.lecture_filename is None:
            date_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.app_data.lecture_filename = f"lecture_{date_time_str}"
        filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
        self.lecture_filename_label.config(text=filename)

        # Collect the **highlighted** passages and write them above the
        # full transcription.
        extracted_texts = extract_text_in_asterisks(transcription_text)
        highlighted_transcription = extracted_texts + "\n\n" + transcription_text
        self.app_data.highlights = extracted_texts
        with open(filename, "w") as file:
            file.write(highlighted_transcription)

        # Extract the keywords and display them for editing.
        keywords = self.extract_topics(transcription_text)
        self.keywords_textarea.delete('1.0', tk.END)
        self.keywords_textarea.insert(tk.END, "\n".join(keywords))

    def extract_topics(self, transcript):
        """Return an alphabetically sorted list of TF-IDF keyword strings
        extracted from *transcript*.

        Sentences are lowercased, stripped of punctuation, tokenized, and
        verb-lemmatized before vectorization.
        """
        lemmatizer = WordNetLemmatizer()
        sentences = re.split(r'[.!?]', transcript)

        cleaned_data = []
        for sentence in sentences:
            sentence = sentence.lower()
            sentence = re.sub(r'[^\w\s]', '', sentence)  # drop punctuation
            words = word_tokenize(sentence)
            # Lemmatize as verbs so inflected forms collapse together.
            lemmatized_sentence = ' '.join(
                lemmatizer.lemmatize(word, pos='v') for word in words
            )
            cleaned_data.append(lemmatized_sentence)

        # TF-IDF over the cleaned sentences, keeping at most 20 features.
        # (Removed a no-op `cleaned_data[:len(cleaned_data)]` slice and an
        # unused `tfidf` matrix binding.)
        tfidf_vectorizer = TfidfVectorizer(
            max_df=0.30,
            min_df=2,
            max_features=20,
            stop_words="english",
        )
        tfidf_vectorizer.fit_transform(cleaned_data)
        return sorted(tfidf_vectorizer.get_feature_names_out())


def extract_text_in_asterisks(text):
    """Return every ``**highlighted**`` span found in *text*, joined with a
    comma and newline between spans."""
    pattern = r'\*\*(.*?)\*\*'
    matches = re.findall(pattern, text)
    return ",\n".join(matches)