{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# (abandoned) Creating RAG from scratch\n", "\n", "\n", "https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Postgresql with pgvector dockerfile\n", "```\n", "version: \"3.6\"\n", "\n", "services:\n", " postgres-pgvector:\n", " image: ankane/pgvector:latest\n", " container_name: postgres-pgvector\n", " restart: always\n", " volumes:\n", " - postgres_pgvector_volume:/var/lib/postgresql/data\n", " ports:\n", " - 5432:5432\n", " environment:\n", " POSTGRES_USER: user\n", " POSTGRES_PASSWORD: password\n", "volumes:\n", " postgres_pgvector_volume:\n", " ```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import psycopg2\n", "\n", "db_name = \"vector_db\"\n", "host = \"localhost\"\n", "password = \"password\"\n", "port = \"5432\"\n", "user = \"user\"\n", "# conn = psycopg2.connect(connection_string)\n", "conn = psycopg2.connect(\n", " dbname='postgres',\n", " host=host,\n", " password=password,\n", " port=port,\n", " user=user,\n", ")\n", "conn.autocommit = True\n", "\n", "with conn.cursor() as c:\n", " c.execute(f\"DROP DATABASE IF EXISTS {db_name}\")\n", " c.execute(f\"CREATE DATABASE {db_name}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sqlalchemy import make_url\n", "from llama_index.vector_stores.postgres import PGVectorStore\n", "\n", "vector_store = PGVectorStore.from_params(\n", " database=db_name,\n", " host=host,\n", " password=password,\n", " port=port,\n", " user=user,\n", " table_name=\"transcription\",\n", " embed_dim=384, # openai embedding dimension\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Use a Text Splitter to Split Documents" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from 
llama_index.core.node_parser import SentenceSplitter\n", "\n", "text_parser = SentenceSplitter(\n", " chunk_size=1024,\n", " # separator=\" \",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from llama_index.core import SimpleDirectoryReader\n", "\n", "reader = SimpleDirectoryReader(input_dir=\"./txt_files/\")\n", "documents = reader.load_data()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "documents" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text_chunks = []\n", "# maintain relationship with source doc index, to help inject doc metadata in (3)\n", "doc_idxs = []\n", "for doc_idx, doc in enumerate(documents):\n", " cur_text_chunks = text_parser.split_text(doc.text)\n", " text_chunks.extend(cur_text_chunks)\n", " doc_idxs.extend([doc_idx] * len(cur_text_chunks))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from llama_index.core.schema import TextNode\n", "\n", "nodes = []\n", "for idx, text_chunk in enumerate(text_chunks):\n", " node = TextNode(\n", " text=text_chunk,\n", " )\n", " src_doc = documents[doc_idxs[idx]]\n", " # copy so chunks from the same doc don't alias one mutable metadata dict\n", " node.metadata = dict(src_doc.metadata)\n", " nodes.append(node)\n", "len(nodes)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sentence transformers\n", "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", "\n", "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# embed each chunk with its metadata; no per-vector printing -- that flooded the output\n", "for node in nodes:\n", " node_embedding = embed_model.get_text_embedding(\n", " node.get_content(metadata_mode=\"all\")\n", " )\n", " node.embedding = node_embedding\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "query_str = \"Bash\"\n", "query_embedding = embed_model.get_query_embedding(query_str)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "query_embedding" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# construct vector store query\n", "from llama_index.core.vector_stores import VectorStoreQuery\n", "\n", "query_mode = \"default\"\n", "# query_mode = \"sparse\"\n", "#uery_mode = \"hybrid\"\n", "\n", "vector_store_query = VectorStoreQuery(\n", " query_embedding=query_embedding, similarity_top_k=10, mode=query_mode\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vector_store_query" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# returns a VectorStoreQueryResult\n", "query_result = vector_store.query(vector_store_query)\n", "print(query_result)" ] } ], "metadata": { "kernelspec": { "display_name": "whispertest-WbvfCJ5O", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }