llm selection, notes saving, cleanup

2024-05-02 00:12:45 +02:00 · 2024-05-02 00:12:45 +02:00 · 44a82f4af8
commit 44a82f4af8
parent ee19d52cb7
13 changed files with 910 additions and 130 deletions
--- a/17
+++ b/17
@ -0,0 +1,17 @@
+FROM python:3.10.12
+
+WORKDIR /LectureSummarizer
+
+COPY . /LectureSummarizer
+
+RUN pip install --no-cache-dir \
+    sounddevice \
+    soundfile \
+    numpy \
+    openai-whisper \
+    torch \
+    scikit-learn \
+    nltk \
+    llama-cpp-python
+
+CMD ["python", "main.py"]
--- a/RAG.ipynb
+++ b/RAG.ipynb
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 # Lecture Summarizer

 ## Description
-Tkinter application to record text with openai-whisper, extract keyword from transcription with sklearn's TF-IDF and generate notes with gguf llm model 
+Tkinter application to record audio, transcribe it with openai-whisper, extract keywords from the transcription with sklearn's TF-IDF and generate notes with a GGUF LLM model

 ## Requirements
 Python 3.10.12
@ -16,20 +16,29 @@ Clone the repository:
 ```bash
 git clone https://github.com/JRoshthen1/LectureSummarizer.git
 ```
+
 Navigate to the project directory:

 ```bash
 cd LectureSummarizer/
 ```
+
 Install dependencies:

 ```bash
-pip install -r requirements.txt
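+# tkinter ships with Python and cannot be installed with pip; on Debian/Ubuntu install it with: sudo apt install python3-tk
+pip install sounddevice soundfile numpy openai-whisper torch scikit-learn nltk llama-cpp-python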
 ```

 ## Usage

+1. Create a `recordings` directory in the root of the application (`./recordings/`)

+2. Download a GGUF model for text generation (see [examples](https://huggingface.co/models?library=gguf))
+
+3. Run the app
+```bash
+python3 main.py
+```

 ## Contributing

--- a/lsb_package/pycache/init.cpython-310.pyc
+++ b/lsb_package/pycache/init.cpython-310.pyc
--- a/lsb_package/pycache/keywords_page.cpython-310.pyc
+++ b/lsb_package/pycache/keywords_page.cpython-310.pyc
--- a/lsb_package/pycache/llm_page.cpython-310.pyc
+++ b/lsb_package/pycache/llm_page.cpython-310.pyc
--- a/lsb_package/pycache/start_page.cpython-310.pyc
+++ b/lsb_package/pycache/start_page.cpython-310.pyc
--- a/lsb_package/pycache/transcription_page.cpython-310.pyc
+++ b/lsb_package/pycache/transcription_page.cpython-310.pyc
--- a/lsb_package/keywords_page.py
+++ b/lsb_package/keywords_page.py
@ -3,8 +3,7 @@ from tkinter import ttk
 from nltk.stem import WordNetLemmatizer
 from datetime import datetime
 import re
-#from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
-from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
 from nltk.tokenize import word_tokenize
 from .llm_page import LlmPage

@ -17,7 +16,6 @@ class KeywordsPage(tk.Frame):
        self.grid_columnconfigure(0, weight=1)
        self.grid_rowconfigure(4, weight=1)
        self.keywords_description = 'Here are the extracted keywords. You can modify them to your liking before feeding them into note generation. Keywords will be added to the top of the transcription file.'
-
        
        tk.Label(self, text="Keywords", font=self.app_data.heading_font).grid(row=0, column=0, columnspan=2)
        
@ -28,18 +26,15 @@ class KeywordsPage(tk.Frame):

        self.keywords_textarea = tk.Text(self, wrap="word", font=self.app_data.paragraph_font)
        self.keywords_textarea.grid(row=3, column=0, sticky="nsew", pady=6, padx=6)
-
        keywords_scrollbar = tk.Scrollbar(self, command=self.keywords_textarea.yview)
        keywords_scrollbar.grid(row=3, column=1, sticky="ns")
        self.keywords_textarea.config(yscrollcommand=keywords_scrollbar.set)
-
        tk.Button(self, text="Generate Notes", command=self.write_kw_and_forward_to_llm_page).grid(row=4, column=0, columnspan=2, pady=5)


    def write_kw_and_forward_to_llm_page(self):
        self.modified_keywords = self.keywords_textarea.get('1.0', tk.END)
        self.app_data.modified_keywords = self.modified_keywords
-        #print(self.app_data.modified_keywords)
        keywords = f"Transcription keywords:\n\n{self.modified_keywords}\n"
        filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
        with open(filename, 'r') as file:
@ -50,9 +45,10 @@ class KeywordsPage(tk.Frame):
        self.app.show_frame(LlmPage)


+
    def start_kw_extraction_process(self, transcription_text):
-        # Save the transcription to a text file
-        if (self.app_data.lecture_filename == None):
+        if (self.app_data.lecture_filename == None):     
+            
            date_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.app_data.lecture_filename = f"lecture_{date_time_str}"
            filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
@ -60,9 +56,11 @@ class KeywordsPage(tk.Frame):
        else:
            filename = f"recordings/transcript_{self.app_data.lecture_filename}.txt"
            self.lecture_filename_label.config(text=filename)
-        
+        extracted_texts = extract_text_in_asterisks(transcription_text)
+        highlighted_transcription = extracted_texts + "\n\n" + transcription_text
+        self.app_data.highlights = extracted_texts
        with open(filename, "w") as file:
-            file.write(transcription_text)
+            file.write(highlighted_transcription)

        # Extract the keywords
        keywords = self.extract_topics(transcription_text)
@ -72,13 +70,8 @@ class KeywordsPage(tk.Frame):
    def extract_topics(self, transcript):
        """Lemmatizing words into their simplest form"""
        lemmatizer = WordNetLemmatizer()
-
-        # Split transcript into sentences
        sentences = re.split(r'[.!?]', transcript)
-
-        # Initialize list to store lemmatized data
        cleaned_data = []
-
        # Preprocess and lemmatize each sentence
        for sentence in sentences:
            # Preprocess the sentence
@ -91,55 +84,22 @@ class KeywordsPage(tk.Frame):
            cleaned_data.append(lemmatized_sentence)


-        """Setting tf-idf and NMF variables"""
+        """Setting tf-idf variables"""
        n_samples = len(cleaned_data)
        n_features = 20
-        n_components = 10
-        n_top_words = 10
-        batch_size = 128
-        init = "nndsvda"

-
-        """Use tf-idf features for NMF"""
        data_samples = cleaned_data[:n_samples]
        tfidf_vectorizer = TfidfVectorizer(
-            max_df=0.30,
-            min_df=2,
-            max_features=n_features,
-            stop_words="english"
+        max_df=0.30,
+        min_df=2,
+        max_features=n_features,
+        stop_words="english"
        )
-        
        tfidf = tfidf_vectorizer.fit_transform(data_samples)
-        
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
-        
-        #print("TF-IDF Feature names: ", tfidf_feature_names)
-
-
-#        nmf = NMF(
-#            n_components=n_components,
-#            max_iter=n_samples,
-#            #tol=1e-4,
-#            random_state=1,
-#            init=init,
-#            beta_loss="frobenius",
-#            alpha_W=0.00005, #directory theres run command dash, file way root theres echo, hello echo example say run, just run say things dash, know things command say theres, like things example theres way, program run echo way say, shell run things way command, use way things dash command, want root say example command
-#            alpha_H=0.00005,
-#            #alpha_W=0, # directory theres run command dash, file way root theres echo, hello echo example say run, just run say things dash, know things command say theres, like things example theres way, program run echo way say, shell run things way command, use way things dash command, want root say example command
-#            #alpha_H=0,
-#            l1_ratio=1,
-#        ).fit(tfidf)
-
-        topics_list = []
-
-        # Collect the top words for each topic
-        #for topic_idx, topic in enumerate(nmf.components_):
-            # Get the top 5 words for this topic
-        #    top_words = [tfidf_feature_names[i] for i in topic.argsort()[:-5 - 1:-1]]
-            # Convert the list of top words to a string and add to the topics list
-        #    topics_list.append(" ".join(top_words))
-
-        topics = set(topics_list)  # Naive splitting by spaces
-        #return sorted(topics)  # Return a sorted list of unique words
-        return sorted(tfidf_feature_names)  # Return a sorted list of unique words
+        return sorted(tfidf_feature_names) 

+def extract_text_in_asterisks(text):
+    pattern = r'\*\*(.*?)\*\*'  
+    matches = re.findall(pattern, text)
+    return ",\n".join(matches)
--- a/lsb_package/llm_page.py
+++ b/lsb_package/llm_page.py
@ -3,6 +3,7 @@ from llama_cpp import Llama
 import tkinter as tk
 from tkinter import ttk
 import threading
+from tkinter import filedialog

 class LlmPage(tk.Frame):
    def __init__(self, parent, app_data):
@ -10,74 +11,43 @@ class LlmPage(tk.Frame):
        self.app = parent
        self.app_data = app_data
        tk.Label(self, text="Notes", font=self.app_data.heading_font).grid(row=0, column=0, sticky="ew", pady=2, padx=2)
-        tk.Label(self, text="Press the generate button and wait for your notes to generate.", font=self.app_data.paragraph_font, wraplength=400, justify="left").grid(row=1, column=0, sticky="ew", pady=2, padx=2)
+        tk.Label(self, text="Press the button bellow to select your LLM and generate notes according to the previously defined keywods (tested on gpt4all-falcon-newbpe-q4_0.gguf) other model output object might not match the display function", font=self.app_data.paragraph_font, wraplength=800, justify="left").grid(row=1, column=0, sticky="ew", pady=2, padx=2)
+
+
        self.text_widget = tk.Text(self, font=self.app_data.paragraph_font, wrap="word")
-        self.text_widget.grid(row=2, column=0, sticky="nsew", pady=6, padx=6)
+        self.text_widget.grid(row=3, column=0, sticky="nsew", pady=6, padx=6)

-        self.start_button = ttk.Button(self, text="Start Operation", command=self.start_llama_operation)
-        self.start_button.grid(row=3, column=0, sticky="ew", pady=2, padx=2)
+        self.start_button = ttk.Button(self, text="Select LLM and generate notes", command=self.start_llama_thread)
+        self.start_button.grid(row=4, column=0, pady=2, padx=2)

-    def start_llama_operation(self):
+        self.save_button = tk.Button(self, text="Save Notes", command=self.save_notes)
+    # LLM MODEL FILE SELECTOR
+    def browse_file(self):
+        llm_model_filename = filedialog.askopenfilename(initialdir="./", title="Select your LLM (GGUF format)", filetypes=(("LLM binary file", "*.gguf"), ("All files", "*.*")))
+        if llm_model_filename:
+            return llm_model_filename
+
+    def start_llama_thread(self):
        if self.app_data.modified_keywords is None:
            self.text_widget.insert(tk.END, "Keywords have not been set.")
        else:
            self.text_widget.delete('1.0', tk.END)
-            self.text_widget.insert(tk.END, "Please wait...")
+            self.text_widget.insert(tk.END, "Generating, Please wait...")

-            operation_thread = threading.Thread(target=self.run_llama_operation, args=(self.app_data.modified_keywords,))  # Pass data explicitly
+            operation_thread = threading.Thread(target=self.run_llama_operation, args=(self.app_data.modified_keywords,self.app_data.highlights,))  # Pass data explicitly
            operation_thread.start()

-    def run_llama_operation(self, llmTopics):
+    def run_llama_operation(self, llmTopics, highlights):
        try:
-            # Example: Llama class must be imported correctly here
-            llm = Llama(model_path="##############", n_ctx=2048, )
-            # output = llm.create_chat_completion(
-            #     messages=[
-            #         {"role": "system", "content": "You are a teacher explaining in great detail given topics divided by new line."},
-            #         {"role": "user", "content": llmTopics}  # Use local variable passed to thread
-            #     ]
-            # )
+            
+            llm = Llama(model_path=self.browse_file(), n_ctx=2048,)
            output = llm(
-                f"Genereate comprehensive, informative and factual descriptions for the provided keywords '{llmTopics}", # Prompt
-                max_tokens=0, # Generate up to 32 tokens, set to None to generate up to the end of the context window
-
+                prompt=f"Genereate comprehensive, informative and factual descriptions for the provided keywords '{llmTopics}'.", # Prompt
+                max_tokens=0,
            )
-            """Generate text from a prompt.
-            Args:
-                prompt: The prompt to generate text from.
-                suffix: A suffix to append to the generated text. If None, no suffix is appended.
-                max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-                temperature: The temperature to use for sampling.
-                top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-                min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-                typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-                logprobs: The number of logprobs to return. If None, no logprobs are returned.
-                echo: Whether to echo the prompt.
-                stop: A list of strings to stop generation when encountered.
-                frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
-                presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
-                repeat_penalty: The penalty to apply to repeated tokens.
-                top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-                stream: Whether to stream the results.
-                seed: The seed to use for sampling.
-                tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-                mirostat_mode: The mirostat sampling mode.
-                mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-                mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-                model: The name to use for the model in the completion object.
-                stopping_criteria: A list of stopping criteria to use.
-                logits_processor: A list of logits processors to use.
-                grammar: A grammar to use for constrained sampling.
-                logit_bias: A logit bias to use.
-
-            Raises:
-                ValueError: If the requested tokens exceed the context window.
-                RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
-
-            Returns:
-                Response object containing the generated text.
-            """
            self.text_widget.after(0, self.update_text_widget, output['choices'][0])
+            self.save_button.grid(row=5, column=0, pady=2, padx=2)
+
        except Exception as e:
            print(f"Error during Llama operation: {e}")
            self.text_widget.after(0, self.update_text_widget, "An error occurred, please try again.")
@ -86,3 +56,10 @@ class LlmPage(tk.Frame):
        if self.winfo_exists():
            self.text_widget.delete('1.0', tk.END)
            self.text_widget.insert(tk.END, content)
+
+    def save_notes(self):
+        text = self.text_widget.get("1.0", "end-1c")  # Get all text from the textarea
+        filename = f"recordings/notes_{self.app_data.lecture_filename}.txt"
+        with open(filename, "w") as file:
+            file.write(text)
+
--- a/lsb_package/start_page.py
+++ b/lsb_package/start_page.py
@ -1,5 +1,6 @@
 import tkinter as tk
 from tkinter import ttk
+import webbrowser
 from datetime import datetime
 import sounddevice as sd
 import soundfile as sf
@ -24,7 +25,6 @@ class StartPage(tk.Frame):
        all_devices = sd.query_devices()
        input_devices = {all_devices[i]['name']: i for i in range(len(all_devices)) if all_devices[i]['max_input_channels'] > 0}

-        # Dropdown for device selection
        self.device_var = tk.StringVar()
        device_names = list(input_devices.keys())
        self.device_menu = ttk.Combobox(self, values=device_names, textvariable=self.device_var)
@ -35,11 +35,23 @@ class StartPage(tk.Frame):
        ttk.Button(self, text="Start Recording", command=self.start_recording).pack(pady=5)
        ttk.Button(self, text="Stop Recording", command=self.stop_recording).pack(pady=5)
        ttk.Button(self, text="Skip Recording", command=self.skip_recording_page).pack(pady=5)
-
+    
        # Recording indicator
        self.recording_indicator = tk.Label(self, text="Recording: OFF", fg="red")
        self.recording_indicator.pack(pady=5)

+
+        def open_license_dialog():
+            webbrowser.open("https://www.gnu.org/licenses/gpl.html")
+        label_link_license = tk.Label(self, text="click here for details. https://www.gnu.org/licenses/gpl.html", fg="blue", cursor="hand2")
+        label_link_license.pack(side="bottom")
+        label_link_license.bind("<Button-1>", lambda event: open_license_dialog())
+        label_license = tk.Label(self, text="Lecture Summarizer Copyright (C) 2024 Martin Jaros\nThis program comes with ABSOLUTELY NO WARRANTY;\nThis is free software, and you are welcome to redistribute it under certain conditions;")
+        label_license.pack(side="bottom")
+
+
+
+
    def start_recording(self):
        if not self.recording:
            self.recording = True
@ -79,7 +91,6 @@ class StartPage(tk.Frame):


    def update_recording_indicator(self, is_recording):
-        #"""Update the recording indicator based on the recording state."""
        if is_recording:
            self.recording_indicator.config(text="Recording: ON", fg="green")
        else:
--- a/lsb_package/transcription_page.py
+++ b/lsb_package/transcription_page.py
@ -59,12 +59,10 @@ class TranscriptionPage(tk.Frame):
        query = self.search_box.get()
        if not query:
            return
-
        # Starting position for the search (insert cursor position)
        start_pos = self.transcription_textarea.index(tk.INSERT)
        # Search for the query in the text area
        pos = self.transcription_textarea.search(query, start_pos, tk.END)
-
        if pos:
            # If found, move cursor to the start of the found text and select the text
            end_pos = f"{pos}+{len(query)}c"  # Calculate end position of the selection
@ -72,13 +70,11 @@ class TranscriptionPage(tk.Frame):
            self.transcription_textarea.tag_add(tk.SEL, pos, end_pos)
            self.transcription_textarea.mark_set(tk.INSERT, pos)
            self.transcription_textarea.see(pos)
-
        # Hide the search box after search
        self.search_box.grid_remove()
        self.search_tooltip.grid_remove()


-
    def insert_into_textarea(self, transcription):
        """Insert transcription text into the Text widget."""
        def update_text():
@ -107,18 +103,14 @@ class TranscriptionPage(tk.Frame):
            whisper_model = whisper.load_model("small", device=hw_device)
            device_label = tk.Label(self, text="Loaded Whisper on: " + hw_device, font=self.app_data.mono_font)
            device_label.grid(row=2, column=0, pady=4)
-
            transcription_text = whisper_model.transcribe(audio_filepath)
            self.insert_into_textarea(transcription_text['text'])
-
-            
            # Collect garbage to free up memory (doesn't seem to work)
            del whisper_model
            if (hw_device=='cuda'):
                torch.cuda.empty_cache()
            gc.collect()  

-
        except Exception as e:
            print(f"Error during transcription: {e}")
            self.insert_into_textarea("Failed to transcribe audio.")
--- a/main.py
+++ b/main.py
@ -6,6 +6,7 @@ class AppData:
    def __init__(self):
        self._lecture_filename = None
        self._modified_keywords = None
+    highlights = None

    @property
    def lecture_filename(self):
@ -22,16 +23,12 @@ class AppData:
    @modified_keywords.setter
    def modified_keywords(self, value):
        self._modified_keywords = value
-    
+
    heading_font = ("DejaVu Sans", 20, "bold")
    paragraph_font = ("DejaVu Sans", 12)
    serif_font = ("DejaVu Serif", 12)
    mono_font = ("Courier", 11)

-# class AppData:
-#     lecture_filename = None
-#     modified_keywords = None
-

 class App(tk.Tk):
    def __init__(self):