llm selection, notes saving, cleanup
This commit is contained in:
parent
ee19d52cb7
commit
44a82f4af8
13 changed files with 910 additions and 130 deletions
17
Dockerfile
Normal file
17
Dockerfile
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
FROM python:3.10.12
|
||||
|
||||
WORKDIR /LectureSummarizer
|
||||
|
||||
COPY . /LectureSummarizer
|
||||
|
||||
RUN pip install --no-cache-dir \
|
||||
sounddevice \
|
||||
soundfile \
|
||||
numpy \
|
||||
openai-whisper \
|
||||
torch \
|
||||
scikit-learn \
|
||||
nltk \
|
||||
llama-cpp-python
|
||||
|
||||
CMD ["python", "main.py"]
|
||||
817
RAG.ipynb
Normal file
817
RAG.ipynb
Normal file
File diff suppressed because one or more lines are too long
13
README.md
13
README.md
|
|
@ -1,7 +1,7 @@
|
|||
# Lecture Summarizer
|
||||
|
||||
## Description
|
||||
Tkinter application to record text with openai-whisper, extract keyword from transcription with sklearn's TF-IDF and generate notes with gguf llm model
|
||||
Tkinter application that records audio, transcribes it with openai-whisper, extracts keywords from the transcription with scikit-learn's TF-IDF, and generates notes with a GGUF LLM model
|
||||
|
||||
## Requirements
|
||||
Python 3.10.12
|
||||
|
|
@ -16,20 +16,29 @@ Clone the repository:
|
|||
```bash
|
||||
git clone https://github.com/JRoshthen1/LectureSummarizer.git
|
||||
```
|
||||
|
||||
Navigate to the project directory:
|
||||
|
||||
```bash
|
||||
cd LectureSummarizer/
|
||||
```
|
||||
|
||||
Install dependencies:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
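# tkinter ships with Python itself (on Debian/Ubuntu: sudo apt install python3-tk); it cannot be installed with pip
pip install sounddevice soundfile numpy openai-whisper torch scikit-learn nltk llama-cpp-python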
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Create a `recordings` directory in the root of the application (`./recordings/`)
|
||||
|
||||
2. Download a gguf model for text generation, [examples](https://huggingface.co/models?library=gguf)
|
||||
|
||||
3. Run the app
|
||||
```bash
|
||||
python3 main.py
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -3,8 +3,7 @@ from tkinter import ttk
|
|||
from nltk.stem import WordNetLemmatizer
|
||||
from datetime import datetime
|
||||
import re
|
||||
#from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
|
||||
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from nltk.tokenize import word_tokenize
|
||||
from .llm_page import LlmPage
|
||||
|
||||
|
|
@ -17,7 +16,6 @@ class KeywordsPage(tk.Frame):
|
|||
self.grid_columnconfigure(0, weight=1)
|
||||
self.grid_rowconfigure(4, weight=1)
|
||||
self.keywords_description = 'Here are the extracted keywords. You can modify them to your liking before feeding them into note generation. Keywords will be added to the top of the transcription file.'
|
||||
|
||||
|
||||
tk.Label(self, text="Keywords", font=self.app_data.heading_font).grid(row=0, column=0, columnspan=2)
|
||||
|
||||
|
|
@ -28,18 +26,15 @@ class KeywordsPage(tk.Frame):
|
|||
|
||||
self.keywords_textarea = tk.Text(self, wrap="word", font=self.app_data.paragraph_font)
|
||||
self.keywords_textarea.grid(row=3, column=0, sticky="nsew", pady=6, padx=6)
|
||||
|
||||
keywords_scrollbar = tk.Scrollbar(self, command=self.keywords_textarea.yview)
|
||||
keywords_scrollbar.grid(row=3, column=1, sticky="ns")
|
||||
self.keywords_textarea.config(yscrollcommand=keywords_scrollbar.set)
|
||||
|
||||
tk.Button(self, text="Generate Notes", command=self.write_kw_and_forward_to_llm_page).grid(row=4, column=0, columnspan=2, pady=5)
|
||||
|
||||
|
||||
def write_kw_and_forward_to_llm_page(self):
|
||||
self.modified_keywords = self.keywords_textarea.get('1.0', tk.END)
|
||||
self.app_data.modified_keywords = self.modified_keywords
|
||||
#print(self.app_data.modified_keywords)
|
||||
keywords = f"Transcription keywords:\n\n{self.modified_keywords}\n"
|
||||
filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
|
||||
with open(filename, 'r') as file:
|
||||
|
|
@ -50,9 +45,10 @@ class KeywordsPage(tk.Frame):
|
|||
self.app.show_frame(LlmPage)
|
||||
|
||||
|
||||
|
||||
def start_kw_extraction_process(self, transcription_text):
|
||||
# Save the transcription to a text file
|
||||
if (self.app_data.lecture_filename == None):
|
||||
if (self.app_data.lecture_filename == None):
|
||||
|
||||
date_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
self.app_data.lecture_filename = f"lecture_{date_time_str}"
|
||||
filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
|
||||
|
|
@ -60,9 +56,11 @@ class KeywordsPage(tk.Frame):
|
|||
else:
|
||||
filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
|
||||
self.lecture_filename_label.config(text=filename)
|
||||
|
||||
extracted_texts = extract_text_in_asterisks(transcription_text)
|
||||
highlighted_transcription = extracted_texts + "\n\n" + transcription_text
|
||||
self.app_data.highlights = extracted_texts
|
||||
with open(filename, "w") as file:
|
||||
file.write(transcription_text)
|
||||
file.write(highlighted_transcription)
|
||||
|
||||
# Extract the keywords
|
||||
keywords = self.extract_topics(transcription_text)
|
||||
|
|
@ -72,13 +70,8 @@ class KeywordsPage(tk.Frame):
|
|||
def extract_topics(self, transcript):
|
||||
"""Lemmatizing words into their simplest form"""
|
||||
lemmatizer = WordNetLemmatizer()
|
||||
|
||||
# Split transcript into sentences
|
||||
sentences = re.split(r'[.!?]', transcript)
|
||||
|
||||
# Initialize list to store lemmatized data
|
||||
cleaned_data = []
|
||||
|
||||
# Preprocess and lemmatize each sentence
|
||||
for sentence in sentences:
|
||||
# Preprocess the sentence
|
||||
|
|
@ -91,55 +84,22 @@ class KeywordsPage(tk.Frame):
|
|||
cleaned_data.append(lemmatized_sentence)
|
||||
|
||||
|
||||
"""Setting tf-idf and NMF variables"""
|
||||
"""Setting tf-idf variables"""
|
||||
n_samples = len(cleaned_data)
|
||||
n_features = 20
|
||||
n_components = 10
|
||||
n_top_words = 10
|
||||
batch_size = 128
|
||||
init = "nndsvda"
|
||||
|
||||
|
||||
"""Use tf-idf features for NMF"""
|
||||
data_samples = cleaned_data[:n_samples]
|
||||
tfidf_vectorizer = TfidfVectorizer(
|
||||
max_df=0.30,
|
||||
min_df=2,
|
||||
max_features=n_features,
|
||||
stop_words="english"
|
||||
max_df=0.30,
|
||||
min_df=2,
|
||||
max_features=n_features,
|
||||
stop_words="english"
|
||||
)
|
||||
|
||||
tfidf = tfidf_vectorizer.fit_transform(data_samples)
|
||||
|
||||
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
|
||||
|
||||
#print("TF-IDF Feature names: ", tfidf_feature_names)
|
||||
|
||||
|
||||
# nmf = NMF(
|
||||
# n_components=n_components,
|
||||
# max_iter=n_samples,
|
||||
# #tol=1e-4,
|
||||
# random_state=1,
|
||||
# init=init,
|
||||
# beta_loss="frobenius",
|
||||
# alpha_W=0.00005, #directory theres run command dash, file way root theres echo, hello echo example say run, just run say things dash, know things command say theres, like things example theres way, program run echo way say, shell run things way command, use way things dash command, want root say example command
|
||||
# alpha_H=0.00005,
|
||||
# #alpha_W=0, # directory theres run command dash, file way root theres echo, hello echo example say run, just run say things dash, know things command say theres, like things example theres way, program run echo way say, shell run things way command, use way things dash command, want root say example command
|
||||
# #alpha_H=0,
|
||||
# l1_ratio=1,
|
||||
# ).fit(tfidf)
|
||||
|
||||
topics_list = []
|
||||
|
||||
# Collect the top words for each topic
|
||||
#for topic_idx, topic in enumerate(nmf.components_):
|
||||
# Get the top 5 words for this topic
|
||||
# top_words = [tfidf_feature_names[i] for i in topic.argsort()[:-5 - 1:-1]]
|
||||
# Convert the list of top words to a string and add to the topics list
|
||||
# topics_list.append(" ".join(top_words))
|
||||
|
||||
topics = set(topics_list) # Naive splitting by spaces
|
||||
#return sorted(topics) # Return a sorted list of unique words
|
||||
return sorted(tfidf_feature_names) # Return a sorted list of unique words
|
||||
return sorted(tfidf_feature_names)
|
||||
|
||||
def extract_text_in_asterisks(text):
    """Collect every span of *text* that is wrapped in double asterisks.

    Each ``**...**`` span is matched non-greedily and must sit on a single
    line, because ``.`` does not match a newline here. The captured spans
    (without the asterisks) are returned in order of appearance as one
    string, separated by a comma followed by a newline. When nothing is
    wrapped in double asterisks the result is an empty string.
    """
    highlights = [
        found.group(1) for found in re.finditer(r'\*\*(.*?)\*\*', text)
    ]
    separator = ",\n"
    return separator.join(highlights)
|
||||
|
|
@ -3,6 +3,7 @@ from llama_cpp import Llama
|
|||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
import threading
|
||||
from tkinter import filedialog
|
||||
|
||||
class LlmPage(tk.Frame):
|
||||
def __init__(self, parent, app_data):
|
||||
|
|
@ -10,74 +11,43 @@ class LlmPage(tk.Frame):
|
|||
self.app = parent
|
||||
self.app_data = app_data
|
||||
tk.Label(self, text="Notes", font=self.app_data.heading_font).grid(row=0, column=0, sticky="ew", pady=2, padx=2)
|
||||
tk.Label(self, text="Press the generate button and wait for your notes to generate.", font=self.app_data.paragraph_font, wraplength=400, justify="left").grid(row=1, column=0, sticky="ew", pady=2, padx=2)
|
||||
tk.Label(self, text="Press the button bellow to select your LLM and generate notes according to the previously defined keywods (tested on gpt4all-falcon-newbpe-q4_0.gguf) other model output object might not match the display function", font=self.app_data.paragraph_font, wraplength=800, justify="left").grid(row=1, column=0, sticky="ew", pady=2, padx=2)
|
||||
|
||||
|
||||
self.text_widget = tk.Text(self, font=self.app_data.paragraph_font, wrap="word")
|
||||
self.text_widget.grid(row=2, column=0, sticky="nsew", pady=6, padx=6)
|
||||
self.text_widget.grid(row=3, column=0, sticky="nsew", pady=6, padx=6)
|
||||
|
||||
self.start_button = ttk.Button(self, text="Start Operation", command=self.start_llama_operation)
|
||||
self.start_button.grid(row=3, column=0, sticky="ew", pady=2, padx=2)
|
||||
self.start_button = ttk.Button(self, text="Select LLM and generate notes", command=self.start_llama_thread)
|
||||
self.start_button.grid(row=4, column=0, pady=2, padx=2)
|
||||
|
||||
def start_llama_operation(self):
|
||||
self.save_button = tk.Button(self, text="Save Notes", command=self.save_notes)
|
||||
# LLM MODEL FILE SELECTOR
|
||||
def browse_file(self):
    """Ask the user to pick a model file through a file-open dialog.

    The dialog starts in the current directory and offers a ``*.gguf``
    filter plus an "All files" filter.

    Returns:
        The value returned by the dialog when it is truthy, otherwise None.
    """
    gguf_filter = ("LLM binary file", "*.gguf")
    any_filter = ("All files", "*.*")
    selection = filedialog.askopenfilename(
        initialdir="./",
        title="Select your LLM (GGUF format)",
        filetypes=(gguf_filter, any_filter),
    )
    return selection if selection else None
|
||||
|
||||
def start_llama_thread(self):
|
||||
if self.app_data.modified_keywords is None:
|
||||
self.text_widget.insert(tk.END, "Keywords have not been set.")
|
||||
else:
|
||||
self.text_widget.delete('1.0', tk.END)
|
||||
self.text_widget.insert(tk.END, "Please wait...")
|
||||
self.text_widget.insert(tk.END, "Generating, Please wait...")
|
||||
|
||||
operation_thread = threading.Thread(target=self.run_llama_operation, args=(self.app_data.modified_keywords,)) # Pass data explicitly
|
||||
operation_thread = threading.Thread(target=self.run_llama_operation, args=(self.app_data.modified_keywords,self.app_data.highlights,)) # Pass data explicitly
|
||||
operation_thread.start()
|
||||
|
||||
def run_llama_operation(self, llmTopics):
|
||||
def run_llama_operation(self, llmTopics, highlights):
|
||||
try:
|
||||
# Example: Llama class must be imported correctly here
|
||||
llm = Llama(model_path="##############", n_ctx=2048, )
|
||||
# output = llm.create_chat_completion(
|
||||
# messages=[
|
||||
# {"role": "system", "content": "You are a teacher explaining in great detail given topics divided by new line."},
|
||||
# {"role": "user", "content": llmTopics} # Use local variable passed to thread
|
||||
# ]
|
||||
# )
|
||||
|
||||
llm = Llama(model_path=self.browse_file(), n_ctx=2048,)
|
||||
output = llm(
|
||||
f"Genereate comprehensive, informative and factual descriptions for the provided keywords '{llmTopics}", # Prompt
|
||||
max_tokens=0, # Generate up to 32 tokens, set to None to generate up to the end of the context window
|
||||
|
||||
prompt=f"Genereate comprehensive, informative and factual descriptions for the provided keywords '{llmTopics}'.", # Prompt
|
||||
max_tokens=0,
|
||||
)
|
||||
"""Generate text from a prompt.
|
||||
Args:
|
||||
prompt: The prompt to generate text from.
|
||||
suffix: A suffix to append to the generated text. If None, no suffix is appended.
|
||||
max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
|
||||
temperature: The temperature to use for sampling.
|
||||
top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
|
||||
typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
||||
logprobs: The number of logprobs to return. If None, no logprobs are returned.
|
||||
echo: Whether to echo the prompt.
|
||||
stop: A list of strings to stop generation when encountered.
|
||||
frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
|
||||
presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
|
||||
repeat_penalty: The penalty to apply to repeated tokens.
|
||||
top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
stream: Whether to stream the results.
|
||||
seed: The seed to use for sampling.
|
||||
tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||
mirostat_mode: The mirostat sampling mode.
|
||||
mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||
mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
model: The name to use for the model in the completion object.
|
||||
stopping_criteria: A list of stopping criteria to use.
|
||||
logits_processor: A list of logits processors to use.
|
||||
grammar: A grammar to use for constrained sampling.
|
||||
logit_bias: A logit bias to use.
|
||||
|
||||
Raises:
|
||||
ValueError: If the requested tokens exceed the context window.
|
||||
RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
|
||||
|
||||
Returns:
|
||||
Response object containing the generated text.
|
||||
"""
|
||||
self.text_widget.after(0, self.update_text_widget, output['choices'][0])
|
||||
self.save_button.grid(row=5, column=0, pady=2, padx=2)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during Llama operation: {e}")
|
||||
self.text_widget.after(0, self.update_text_widget, "An error occurred, please try again.")
|
||||
|
|
@ -86,3 +56,10 @@ class LlmPage(tk.Frame):
|
|||
if self.winfo_exists():
|
||||
self.text_widget.delete('1.0', tk.END)
|
||||
self.text_widget.insert(tk.END, content)
|
||||
|
||||
def save_notes(self):
    """Write the notes currently shown in the text widget to disk.

    The target is ``recordings/notes_<lecture_filename>.txt``, where
    ``<lecture_filename>`` is read from ``self.app_data.lecture_filename``.
    An existing file with that name is overwritten.

    The ``recordings`` directory is created when it is missing, and the
    file is written as UTF-8 so that notes containing non-ASCII characters
    do not depend on the platform's default encoding.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    # Local import so this method does not rely on the module's import block.
    import os

    # "end-1c" reads everything up to, but not including, the final character.
    notes = self.text_widget.get("1.0", "end-1c")
    filename = f"recordings/notes_{self.app_data.lecture_filename}.txt"

    # Without this, a missing recordings/ directory makes open() fail.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename, "w", encoding="utf-8") as file:
        file.write(notes)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
import webbrowser
|
||||
from datetime import datetime
|
||||
import sounddevice as sd
|
||||
import soundfile as sf
|
||||
|
|
@ -24,7 +25,6 @@ class StartPage(tk.Frame):
|
|||
all_devices = sd.query_devices()
|
||||
input_devices = {all_devices[i]['name']: i for i in range(len(all_devices)) if all_devices[i]['max_input_channels'] > 0}
|
||||
|
||||
# Dropdown for device selection
|
||||
self.device_var = tk.StringVar()
|
||||
device_names = list(input_devices.keys())
|
||||
self.device_menu = ttk.Combobox(self, values=device_names, textvariable=self.device_var)
|
||||
|
|
@ -35,11 +35,23 @@ class StartPage(tk.Frame):
|
|||
ttk.Button(self, text="Start Recording", command=self.start_recording).pack(pady=5)
|
||||
ttk.Button(self, text="Stop Recording", command=self.stop_recording).pack(pady=5)
|
||||
ttk.Button(self, text="Skip Recording", command=self.skip_recording_page).pack(pady=5)
|
||||
|
||||
|
||||
# Recording indicator
|
||||
self.recording_indicator = tk.Label(self, text="Recording: OFF", fg="red")
|
||||
self.recording_indicator.pack(pady=5)
|
||||
|
||||
|
||||
def open_license_dialog():
|
||||
webbrowser.open("https://www.gnu.org/licenses/gpl.html")
|
||||
label_link_license = tk.Label(self, text="click here for details. https://www.gnu.org/licenses/gpl.html", fg="blue", cursor="hand2")
|
||||
label_link_license.pack(side="bottom")
|
||||
label_link_license.bind("<Button-1>", lambda event: open_license_dialog())
|
||||
label_license = tk.Label(self, text="Lecture Summarizer Copyright (C) 2024 Martin Jaros\nThis program comes with ABSOLUTELY NO WARRANTY;\nThis is free software, and you are welcome to redistribute it under certain conditions;")
|
||||
label_license.pack(side="bottom")
|
||||
|
||||
|
||||
|
||||
|
||||
def start_recording(self):
|
||||
if not self.recording:
|
||||
self.recording = True
|
||||
|
|
@ -79,7 +91,6 @@ class StartPage(tk.Frame):
|
|||
|
||||
|
||||
def update_recording_indicator(self, is_recording):
|
||||
#"""Update the recording indicator based on the recording state."""
|
||||
if is_recording:
|
||||
self.recording_indicator.config(text="Recording: ON", fg="green")
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -59,12 +59,10 @@ class TranscriptionPage(tk.Frame):
|
|||
query = self.search_box.get()
|
||||
if not query:
|
||||
return
|
||||
|
||||
# Starting position for the search (insert cursor position)
|
||||
start_pos = self.transcription_textarea.index(tk.INSERT)
|
||||
# Search for the query in the text area
|
||||
pos = self.transcription_textarea.search(query, start_pos, tk.END)
|
||||
|
||||
if pos:
|
||||
# If found, move cursor to the start of the found text and select the text
|
||||
end_pos = f"{pos}+{len(query)}c" # Calculate end position of the selection
|
||||
|
|
@ -72,13 +70,11 @@ class TranscriptionPage(tk.Frame):
|
|||
self.transcription_textarea.tag_add(tk.SEL, pos, end_pos)
|
||||
self.transcription_textarea.mark_set(tk.INSERT, pos)
|
||||
self.transcription_textarea.see(pos)
|
||||
|
||||
# Hide the search box after search
|
||||
self.search_box.grid_remove()
|
||||
self.search_tooltip.grid_remove()
|
||||
|
||||
|
||||
|
||||
def insert_into_textarea(self, transcription):
|
||||
"""Insert transcription text into the Text widget."""
|
||||
def update_text():
|
||||
|
|
@ -107,18 +103,14 @@ class TranscriptionPage(tk.Frame):
|
|||
whisper_model = whisper.load_model("small", device=hw_device)
|
||||
device_label = tk.Label(self, text="Loaded Whisper on: " + hw_device, font=self.app_data.mono_font)
|
||||
device_label.grid(row=2, column=0, pady=4)
|
||||
|
||||
transcription_text = whisper_model.transcribe(audio_filepath)
|
||||
self.insert_into_textarea(transcription_text['text'])
|
||||
|
||||
|
||||
# Collect garbage to free up memory (doesn't seem to work)
|
||||
del whisper_model
|
||||
if (hw_device=='cuda'):
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during transcription: {e}")
|
||||
self.insert_into_textarea("Failed to transcribe audio.")
|
||||
|
|
|
|||
7
main.py
7
main.py
|
|
@ -6,6 +6,7 @@ class AppData:
|
|||
def __init__(self):
|
||||
self._lecture_filename = None
|
||||
self._modified_keywords = None
|
||||
highlights = None
|
||||
|
||||
@property
|
||||
def lecture_filename(self):
|
||||
|
|
@ -22,16 +23,12 @@ class AppData:
|
|||
@modified_keywords.setter
|
||||
def modified_keywords(self, value):
|
||||
self._modified_keywords = value
|
||||
|
||||
|
||||
heading_font = ("DejaVu Sans", 20, "bold")
|
||||
paragraph_font = ("DejaVu Sans", 12)
|
||||
serif_font = ("DejaVu Serif", 12)
|
||||
mono_font = ("Courier", 11)
|
||||
|
||||
# class AppData:
|
||||
# lecture_filename = None
|
||||
# modified_keywords = None
|
||||
|
||||
|
||||
class App(tk.Tk):
|
||||
def __init__(self):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue