llm selection, notes saving, cleanup

This commit is contained in:
Martin Jaros 2024-05-02 00:12:45 +02:00
commit 44a82f4af8
13 changed files with 910 additions and 130 deletions

17
Dockerfile Normal file
View file

@ -0,0 +1,17 @@
FROM python:3.10.12
WORKDIR /LectureSummarizer
COPY . /LectureSummarizer
RUN pip install --no-cache-dir \
sounddevice \
soundfile \
numpy \
openai-whisper \
torch \
scikit-learn \
nltk \
llama-cpp-python
CMD ["python", "main.py"]

817
RAG.ipynb Normal file

File diff suppressed because one or more lines are too long

View file

@ -1,7 +1,7 @@
# Lecture Summarizer # Lecture Summarizer
## Description ## Description
Tkinter application to record text with openai-whisper, extract keyword from transcription with sklearn's TF-IDF and generate notes with gguf llm model Tkinter application to record text with openai-whisper, extract keywords from transcription with sklearn's TF-IDF and generate notes with gguf llm model
## Requirements ## Requirements
Python 3.10.12 Python 3.10.12
@ -16,20 +16,29 @@ Clone the repository:
```bash ```bash
git clone https://github.com/JRoshthen1/LectureSummarizer.git git clone https://github.com/JRoshthen1/LectureSummarizer.git
``` ```
Navigate to the project directory: Navigate to the project directory:
```bash ```bash
cd LectureSummarizer/ cd LectureSummarizer/
``` ```
Install dependencies: Install dependencies:
```bash ```bash
pip install -r requirements.txt pip install tkinter sounddevice soundfile numpy openai-whisper torch scikit-learn nltk llama-cpp-python
``` ```
## Usage ## Usage
1. Create a `recordings` directory in the root of the application (./recordings/)
2. Download a GGUF model for text generation (see [examples](https://huggingface.co/models?library=gguf))
3. Run the app
```bash
python3 main.py
```
## Contributing ## Contributing

View file

@ -3,8 +3,7 @@ from tkinter import ttk
from nltk.stem import WordNetLemmatizer from nltk.stem import WordNetLemmatizer
from datetime import datetime from datetime import datetime
import re import re
#from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from .llm_page import LlmPage from .llm_page import LlmPage
@ -17,7 +16,6 @@ class KeywordsPage(tk.Frame):
self.grid_columnconfigure(0, weight=1) self.grid_columnconfigure(0, weight=1)
self.grid_rowconfigure(4, weight=1) self.grid_rowconfigure(4, weight=1)
self.keywords_description = 'Here are the extracted keywords. You can modify them to your liking before feeding them into note generation. Keywords will be added to the top of the transcription file.' self.keywords_description = 'Here are the extracted keywords. You can modify them to your liking before feeding them into note generation. Keywords will be added to the top of the transcription file.'
tk.Label(self, text="Keywords", font=self.app_data.heading_font).grid(row=0, column=0, columnspan=2) tk.Label(self, text="Keywords", font=self.app_data.heading_font).grid(row=0, column=0, columnspan=2)
@ -28,18 +26,15 @@ class KeywordsPage(tk.Frame):
self.keywords_textarea = tk.Text(self, wrap="word", font=self.app_data.paragraph_font) self.keywords_textarea = tk.Text(self, wrap="word", font=self.app_data.paragraph_font)
self.keywords_textarea.grid(row=3, column=0, sticky="nsew", pady=6, padx=6) self.keywords_textarea.grid(row=3, column=0, sticky="nsew", pady=6, padx=6)
keywords_scrollbar = tk.Scrollbar(self, command=self.keywords_textarea.yview) keywords_scrollbar = tk.Scrollbar(self, command=self.keywords_textarea.yview)
keywords_scrollbar.grid(row=3, column=1, sticky="ns") keywords_scrollbar.grid(row=3, column=1, sticky="ns")
self.keywords_textarea.config(yscrollcommand=keywords_scrollbar.set) self.keywords_textarea.config(yscrollcommand=keywords_scrollbar.set)
tk.Button(self, text="Generate Notes", command=self.write_kw_and_forward_to_llm_page).grid(row=4, column=0, columnspan=2, pady=5) tk.Button(self, text="Generate Notes", command=self.write_kw_and_forward_to_llm_page).grid(row=4, column=0, columnspan=2, pady=5)
def write_kw_and_forward_to_llm_page(self): def write_kw_and_forward_to_llm_page(self):
self.modified_keywords = self.keywords_textarea.get('1.0', tk.END) self.modified_keywords = self.keywords_textarea.get('1.0', tk.END)
self.app_data.modified_keywords = self.modified_keywords self.app_data.modified_keywords = self.modified_keywords
#print(self.app_data.modified_keywords)
keywords = f"Transcription keywords:\n\n{self.modified_keywords}\n" keywords = f"Transcription keywords:\n\n{self.modified_keywords}\n"
filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt" filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
with open(filename, 'r') as file: with open(filename, 'r') as file:
@ -50,9 +45,10 @@ class KeywordsPage(tk.Frame):
self.app.show_frame(LlmPage) self.app.show_frame(LlmPage)
def start_kw_extraction_process(self, transcription_text): def start_kw_extraction_process(self, transcription_text):
# Save the transcription to a text file if (self.app_data.lecture_filename == None):
if (self.app_data.lecture_filename == None):
date_time_str = datetime.now().strftime("%Y%m%d_%H%M%S") date_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
self.app_data.lecture_filename = f"lecture_{date_time_str}" self.app_data.lecture_filename = f"lecture_{date_time_str}"
filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt" filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
@ -60,9 +56,11 @@ class KeywordsPage(tk.Frame):
else: else:
filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt" filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
self.lecture_filename_label.config(text=filename) self.lecture_filename_label.config(text=filename)
extracted_texts = extract_text_in_asterisks(transcription_text)
highlighted_transcription = extracted_texts + "\n\n" + transcription_text
self.app_data.highlights = extracted_texts
with open(filename, "w") as file: with open(filename, "w") as file:
file.write(transcription_text) file.write(highlighted_transcription)
# Extract the keywords # Extract the keywords
keywords = self.extract_topics(transcription_text) keywords = self.extract_topics(transcription_text)
@ -72,13 +70,8 @@ class KeywordsPage(tk.Frame):
def extract_topics(self, transcript): def extract_topics(self, transcript):
"""Lemmatizing words into their simplest form""" """Lemmatizing words into their simplest form"""
lemmatizer = WordNetLemmatizer() lemmatizer = WordNetLemmatizer()
# Split transcript into sentences
sentences = re.split(r'[.!?]', transcript) sentences = re.split(r'[.!?]', transcript)
# Initialize list to store lemmatized data
cleaned_data = [] cleaned_data = []
# Preprocess and lemmatize each sentence # Preprocess and lemmatize each sentence
for sentence in sentences: for sentence in sentences:
# Preprocess the sentence # Preprocess the sentence
@ -91,55 +84,22 @@ class KeywordsPage(tk.Frame):
cleaned_data.append(lemmatized_sentence) cleaned_data.append(lemmatized_sentence)
"""Setting tf-idf and NMF variables""" """Setting tf-idf variables"""
n_samples = len(cleaned_data) n_samples = len(cleaned_data)
n_features = 20 n_features = 20
n_components = 10
n_top_words = 10
batch_size = 128
init = "nndsvda"
"""Use tf-idf features for NMF"""
data_samples = cleaned_data[:n_samples] data_samples = cleaned_data[:n_samples]
tfidf_vectorizer = TfidfVectorizer( tfidf_vectorizer = TfidfVectorizer(
max_df=0.30, max_df=0.30,
min_df=2, min_df=2,
max_features=n_features, max_features=n_features,
stop_words="english" stop_words="english"
) )
tfidf = tfidf_vectorizer.fit_transform(data_samples) tfidf = tfidf_vectorizer.fit_transform(data_samples)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out() tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
return sorted(tfidf_feature_names)
#print("TF-IDF Feature names: ", tfidf_feature_names)
# nmf = NMF(
# n_components=n_components,
# max_iter=n_samples,
# #tol=1e-4,
# random_state=1,
# init=init,
# beta_loss="frobenius",
# alpha_W=0.00005, #directory theres run command dash, file way root theres echo, hello echo example say run, just run say things dash, know things command say theres, like things example theres way, program run echo way say, shell run things way command, use way things dash command, want root say example command
# alpha_H=0.00005,
# #alpha_W=0, # directory theres run command dash, file way root theres echo, hello echo example say run, just run say things dash, know things command say theres, like things example theres way, program run echo way say, shell run things way command, use way things dash command, want root say example command
# #alpha_H=0,
# l1_ratio=1,
# ).fit(tfidf)
topics_list = []
# Collect the top words for each topic
#for topic_idx, topic in enumerate(nmf.components_):
# Get the top 5 words for this topic
# top_words = [tfidf_feature_names[i] for i in topic.argsort()[:-5 - 1:-1]]
# Convert the list of top words to a string and add to the topics list
# topics_list.append(" ".join(top_words))
topics = set(topics_list) # Naive splitting by spaces
#return sorted(topics) # Return a sorted list of unique words
return sorted(tfidf_feature_names) # Return a sorted list of unique words
def extract_text_in_asterisks(text):
    """Collect every passage wrapped in double asterisks.

    Args:
        text: String to scan for ``**highlighted**`` passages.

    Returns:
        The captured passages with the ``**`` markers removed, in order of
        appearance, joined by a comma followed by a newline. An empty
        string is returned when nothing matches.
    """
    # Non-greedy capture so each highlight stops at the nearest closing "**".
    # "." does not match a line break here, so a highlight has to open and
    # close on the same line to be picked up.
    pattern = r'\*\*(.*?)\*\*'
    matches = re.findall(pattern, text)
    return ",\n".join(matches)

View file

@ -3,6 +3,7 @@ from llama_cpp import Llama
import tkinter as tk import tkinter as tk
from tkinter import ttk from tkinter import ttk
import threading import threading
from tkinter import filedialog
class LlmPage(tk.Frame): class LlmPage(tk.Frame):
def __init__(self, parent, app_data): def __init__(self, parent, app_data):
@ -10,74 +11,43 @@ class LlmPage(tk.Frame):
self.app = parent self.app = parent
self.app_data = app_data self.app_data = app_data
tk.Label(self, text="Notes", font=self.app_data.heading_font).grid(row=0, column=0, sticky="ew", pady=2, padx=2) tk.Label(self, text="Notes", font=self.app_data.heading_font).grid(row=0, column=0, sticky="ew", pady=2, padx=2)
tk.Label(self, text="Press the generate button and wait for your notes to generate.", font=self.app_data.paragraph_font, wraplength=400, justify="left").grid(row=1, column=0, sticky="ew", pady=2, padx=2) tk.Label(self, text="Press the button bellow to select your LLM and generate notes according to the previously defined keywods (tested on gpt4all-falcon-newbpe-q4_0.gguf) other model output object might not match the display function", font=self.app_data.paragraph_font, wraplength=800, justify="left").grid(row=1, column=0, sticky="ew", pady=2, padx=2)
self.text_widget = tk.Text(self, font=self.app_data.paragraph_font, wrap="word") self.text_widget = tk.Text(self, font=self.app_data.paragraph_font, wrap="word")
self.text_widget.grid(row=2, column=0, sticky="nsew", pady=6, padx=6) self.text_widget.grid(row=3, column=0, sticky="nsew", pady=6, padx=6)
self.start_button = ttk.Button(self, text="Start Operation", command=self.start_llama_operation) self.start_button = ttk.Button(self, text="Select LLM and generate notes", command=self.start_llama_thread)
self.start_button.grid(row=3, column=0, sticky="ew", pady=2, padx=2) self.start_button.grid(row=4, column=0, pady=2, padx=2)
def start_llama_operation(self): self.save_button = tk.Button(self, text="Save Notes", command=self.save_notes)
# LLM MODEL FILE SELECTOR
def browse_file(self):
    """Ask the user to pick the LLM model file.

    Opens a file-selection dialog that starts in the current directory and
    offers ``*.gguf`` files first, with an "All files" fallback.

    Returns:
        The path chosen in the dialog, or ``None`` when the dialog
        produced no selection.
    """
    # NOTE(review): the visible caller invokes this from the thread started
    # in start_llama_thread; Tk dialogs are normally expected to be opened
    # from the main thread -- confirm this is safe on all platforms.
    llm_model_filename = filedialog.askopenfilename(
        initialdir="./",
        title="Select your LLM (GGUF format)",
        filetypes=(("LLM binary file", "*.gguf"), ("All files", "*.*")),
    )
    # An empty (falsy) result means no file was chosen; say so explicitly
    # instead of falling off the end of the function.
    return llm_model_filename or None
def start_llama_thread(self):
if self.app_data.modified_keywords is None: if self.app_data.modified_keywords is None:
self.text_widget.insert(tk.END, "Keywords have not been set.") self.text_widget.insert(tk.END, "Keywords have not been set.")
else: else:
self.text_widget.delete('1.0', tk.END) self.text_widget.delete('1.0', tk.END)
self.text_widget.insert(tk.END, "Please wait...") self.text_widget.insert(tk.END, "Generating, Please wait...")
operation_thread = threading.Thread(target=self.run_llama_operation, args=(self.app_data.modified_keywords,)) # Pass data explicitly operation_thread = threading.Thread(target=self.run_llama_operation, args=(self.app_data.modified_keywords,self.app_data.highlights,)) # Pass data explicitly
operation_thread.start() operation_thread.start()
def run_llama_operation(self, llmTopics): def run_llama_operation(self, llmTopics, highlights):
try: try:
# Example: Llama class must be imported correctly here
llm = Llama(model_path="##############", n_ctx=2048, ) llm = Llama(model_path=self.browse_file(), n_ctx=2048,)
# output = llm.create_chat_completion(
# messages=[
# {"role": "system", "content": "You are a teacher explaining in great detail given topics divided by new line."},
# {"role": "user", "content": llmTopics} # Use local variable passed to thread
# ]
# )
output = llm( output = llm(
f"Genereate comprehensive, informative and factual descriptions for the provided keywords '{llmTopics}", # Prompt prompt=f"Genereate comprehensive, informative and factual descriptions for the provided keywords '{llmTopics}'.", # Prompt
max_tokens=0, # Generate up to 32 tokens, set to None to generate up to the end of the context window max_tokens=0,
) )
"""Generate text from a prompt.
Args:
prompt: The prompt to generate text from.
suffix: A suffix to append to the generated text. If None, no suffix is appended.
max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
temperature: The temperature to use for sampling.
top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
logprobs: The number of logprobs to return. If None, no logprobs are returned.
echo: Whether to echo the prompt.
stop: A list of strings to stop generation when encountered.
frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
repeat_penalty: The penalty to apply to repeated tokens.
top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
stream: Whether to stream the results.
seed: The seed to use for sampling.
tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
mirostat_mode: The mirostat sampling mode.
mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
model: The name to use for the model in the completion object.
stopping_criteria: A list of stopping criteria to use.
logits_processor: A list of logits processors to use.
grammar: A grammar to use for constrained sampling.
logit_bias: A logit bias to use.
Raises:
ValueError: If the requested tokens exceed the context window.
RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
Returns:
Response object containing the generated text.
"""
self.text_widget.after(0, self.update_text_widget, output['choices'][0]) self.text_widget.after(0, self.update_text_widget, output['choices'][0])
self.save_button.grid(row=5, column=0, pady=2, padx=2)
except Exception as e: except Exception as e:
print(f"Error during Llama operation: {e}") print(f"Error during Llama operation: {e}")
self.text_widget.after(0, self.update_text_widget, "An error occurred, please try again.") self.text_widget.after(0, self.update_text_widget, "An error occurred, please try again.")
@ -86,3 +56,10 @@ class LlmPage(tk.Frame):
if self.winfo_exists(): if self.winfo_exists():
self.text_widget.delete('1.0', tk.END) self.text_widget.delete('1.0', tk.END)
self.text_widget.insert(tk.END, content) self.text_widget.insert(tk.END, content)
def save_notes(self):
    """Write the notes currently shown in the text widget to disk.

    The target is ``recordings/notes_<lecture_filename>.txt``, where the
    lecture filename is read from ``self.app_data``. An existing file with
    the same name is overwritten.
    """
    import os  # local import: only this method needs it

    # "end-1c" drops the final newline the Text widget adds on its own.
    text = self.text_widget.get("1.0", "end-1c")
    filename = f"recordings/notes_{self.app_data.lecture_filename}.txt"
    # Create the folder if it is missing so saving does not fail when the
    # user has not prepared it by hand.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # Explicit UTF-8 so generated text with non-ASCII characters is written
    # the same way regardless of the platform's default encoding.
    with open(filename, "w", encoding="utf-8") as file:
        file.write(text)

View file

@ -1,5 +1,6 @@
import tkinter as tk import tkinter as tk
from tkinter import ttk from tkinter import ttk
import webbrowser
from datetime import datetime from datetime import datetime
import sounddevice as sd import sounddevice as sd
import soundfile as sf import soundfile as sf
@ -24,7 +25,6 @@ class StartPage(tk.Frame):
all_devices = sd.query_devices() all_devices = sd.query_devices()
input_devices = {all_devices[i]['name']: i for i in range(len(all_devices)) if all_devices[i]['max_input_channels'] > 0} input_devices = {all_devices[i]['name']: i for i in range(len(all_devices)) if all_devices[i]['max_input_channels'] > 0}
# Dropdown for device selection
self.device_var = tk.StringVar() self.device_var = tk.StringVar()
device_names = list(input_devices.keys()) device_names = list(input_devices.keys())
self.device_menu = ttk.Combobox(self, values=device_names, textvariable=self.device_var) self.device_menu = ttk.Combobox(self, values=device_names, textvariable=self.device_var)
@ -35,11 +35,23 @@ class StartPage(tk.Frame):
ttk.Button(self, text="Start Recording", command=self.start_recording).pack(pady=5) ttk.Button(self, text="Start Recording", command=self.start_recording).pack(pady=5)
ttk.Button(self, text="Stop Recording", command=self.stop_recording).pack(pady=5) ttk.Button(self, text="Stop Recording", command=self.stop_recording).pack(pady=5)
ttk.Button(self, text="Skip Recording", command=self.skip_recording_page).pack(pady=5) ttk.Button(self, text="Skip Recording", command=self.skip_recording_page).pack(pady=5)
# Recording indicator # Recording indicator
self.recording_indicator = tk.Label(self, text="Recording: OFF", fg="red") self.recording_indicator = tk.Label(self, text="Recording: OFF", fg="red")
self.recording_indicator.pack(pady=5) self.recording_indicator.pack(pady=5)
def open_license_dialog():
webbrowser.open("https://www.gnu.org/licenses/gpl.html")
label_link_license = tk.Label(self, text="click here for details. https://www.gnu.org/licenses/gpl.html", fg="blue", cursor="hand2")
label_link_license.pack(side="bottom")
label_link_license.bind("<Button-1>", lambda event: open_license_dialog())
label_license = tk.Label(self, text="Lecture Summarizer Copyright (C) 2024 Martin Jaros\nThis program comes with ABSOLUTELY NO WARRANTY;\nThis is free software, and you are welcome to redistribute it under certain conditions;")
label_license.pack(side="bottom")
def start_recording(self): def start_recording(self):
if not self.recording: if not self.recording:
self.recording = True self.recording = True
@ -79,7 +91,6 @@ class StartPage(tk.Frame):
def update_recording_indicator(self, is_recording): def update_recording_indicator(self, is_recording):
#"""Update the recording indicator based on the recording state."""
if is_recording: if is_recording:
self.recording_indicator.config(text="Recording: ON", fg="green") self.recording_indicator.config(text="Recording: ON", fg="green")
else: else:

View file

@ -59,12 +59,10 @@ class TranscriptionPage(tk.Frame):
query = self.search_box.get() query = self.search_box.get()
if not query: if not query:
return return
# Starting position for the search (insert cursor position) # Starting position for the search (insert cursor position)
start_pos = self.transcription_textarea.index(tk.INSERT) start_pos = self.transcription_textarea.index(tk.INSERT)
# Search for the query in the text area # Search for the query in the text area
pos = self.transcription_textarea.search(query, start_pos, tk.END) pos = self.transcription_textarea.search(query, start_pos, tk.END)
if pos: if pos:
# If found, move cursor to the start of the found text and select the text # If found, move cursor to the start of the found text and select the text
end_pos = f"{pos}+{len(query)}c" # Calculate end position of the selection end_pos = f"{pos}+{len(query)}c" # Calculate end position of the selection
@ -72,13 +70,11 @@ class TranscriptionPage(tk.Frame):
self.transcription_textarea.tag_add(tk.SEL, pos, end_pos) self.transcription_textarea.tag_add(tk.SEL, pos, end_pos)
self.transcription_textarea.mark_set(tk.INSERT, pos) self.transcription_textarea.mark_set(tk.INSERT, pos)
self.transcription_textarea.see(pos) self.transcription_textarea.see(pos)
# Hide the search box after search # Hide the search box after search
self.search_box.grid_remove() self.search_box.grid_remove()
self.search_tooltip.grid_remove() self.search_tooltip.grid_remove()
def insert_into_textarea(self, transcription): def insert_into_textarea(self, transcription):
"""Insert transcription text into the Text widget.""" """Insert transcription text into the Text widget."""
def update_text(): def update_text():
@ -107,18 +103,14 @@ class TranscriptionPage(tk.Frame):
whisper_model = whisper.load_model("small", device=hw_device) whisper_model = whisper.load_model("small", device=hw_device)
device_label = tk.Label(self, text="Loaded Whisper on: " + hw_device, font=self.app_data.mono_font) device_label = tk.Label(self, text="Loaded Whisper on: " + hw_device, font=self.app_data.mono_font)
device_label.grid(row=2, column=0, pady=4) device_label.grid(row=2, column=0, pady=4)
transcription_text = whisper_model.transcribe(audio_filepath) transcription_text = whisper_model.transcribe(audio_filepath)
self.insert_into_textarea(transcription_text['text']) self.insert_into_textarea(transcription_text['text'])
# Collect garbage to free up memory (doesn't seem to work) # Collect garbage to free up memory (doesn't seem to work)
del whisper_model del whisper_model
if (hw_device=='cuda'): if (hw_device=='cuda'):
torch.cuda.empty_cache() torch.cuda.empty_cache()
gc.collect() gc.collect()
except Exception as e: except Exception as e:
print(f"Error during transcription: {e}") print(f"Error during transcription: {e}")
self.insert_into_textarea("Failed to transcribe audio.") self.insert_into_textarea("Failed to transcribe audio.")

View file

@ -6,6 +6,7 @@ class AppData:
def __init__(self): def __init__(self):
self._lecture_filename = None self._lecture_filename = None
self._modified_keywords = None self._modified_keywords = None
highlights = None
@property @property
def lecture_filename(self): def lecture_filename(self):
@ -22,16 +23,12 @@ class AppData:
@modified_keywords.setter @modified_keywords.setter
def modified_keywords(self, value): def modified_keywords(self, value):
self._modified_keywords = value self._modified_keywords = value
heading_font = ("DejaVu Sans", 20, "bold") heading_font = ("DejaVu Sans", 20, "bold")
paragraph_font = ("DejaVu Sans", 12) paragraph_font = ("DejaVu Sans", 12)
serif_font = ("DejaVu Serif", 12) serif_font = ("DejaVu Serif", 12)
mono_font = ("Courier", 11) mono_font = ("Courier", 11)
# class AppData:
# lecture_filename = None
# modified_keywords = None
class App(tk.Tk): class App(tk.Tk):
def __init__(self): def __init__(self):