LectureSummarizer/RAG.ipynb
Martin Jaros 340389f4f4 cleanup
2024-05-03 19:34:21 +02:00

267 lines
6.2 KiB
Text

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# (abandoned) Creating RAG from scratch\n",
"\n",
"\n",
"https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Postgresql with pgvector dockerfile\n",
"```\n",
"version: \"3.6\"\n",
"\n",
"services:\n",
" postgres-pgvector:\n",
" image: ankane/pgvector:latest\n",
" container_name: postgres-pgvector\n",
" restart: always\n",
" volumes:\n",
" - postgres_pgvector_volume:/var/lib/postgresql/data\n",
" ports:\n",
" - 5432:5432\n",
" environment:\n",
" POSTGRES_USER: user\n",
" POSTGRES_PASSWORD: password\n",
"volumes:\n",
" postgres_pgvector_volume:\n",
" ```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Connect to the default 'postgres' DB and (re)create the vector database.\n",
"# NOTE(review): credentials are hardcoded for the local docker-compose setup\n",
"# shown above; use environment variables for anything non-local.\n",
"import psycopg2\n",
"\n",
"db_name = \"vector_db\"\n",
"host = \"localhost\"\n",
"password = \"password\"\n",
"port = \"5432\"\n",
"user = \"user\"\n",
"# conn = psycopg2.connect(connection_string)\n",
"conn = psycopg2.connect(\n",
" dbname='postgres',\n",
" host=host,\n",
" password=password,\n",
" port=port,\n",
" user=user,\n",
")\n",
"# CREATE/DROP DATABASE cannot run inside a transaction block, hence autocommit.\n",
"conn.autocommit = True\n",
"\n",
"# Destructive: drops any existing vector_db before recreating it.\n",
"with conn.cursor() as c:\n",
" c.execute(f\"DROP DATABASE IF EXISTS {db_name}\")\n",
" c.execute(f\"CREATE DATABASE {db_name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the pgvector-backed vector store in the database created above.\n",
"# NOTE(review): make_url is imported but never used in this notebook.\n",
"from sqlalchemy import make_url\n",
"from llama_index.vector_stores.postgres import PGVectorStore\n",
"\n",
"vector_store = PGVectorStore.from_params(\n",
" database=db_name,\n",
" host=host,\n",
" password=password,\n",
" port=port,\n",
" user=user,\n",
" table_name=\"transcription\",\n",
" embed_dim=384, # dimension of BAAI/bge-small-en-v1.5 used below (not OpenAI's 1536)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Use a Text Splitter to Split Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sentence-aware splitter producing chunks of up to chunk_size=1024\n",
"# (library-default overlap applies).\n",
"from llama_index.core.node_parser import SentenceSplitter\n",
"\n",
"text_parser = SentenceSplitter(\n",
" chunk_size=1024,\n",
" # separator=\" \",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load every file from ./txt_files/ as a llama-index Document.\n",
"from llama_index.core import SimpleDirectoryReader\n",
"\n",
"reader = SimpleDirectoryReader(input_dir=\"./txt_files/\")\n",
"documents = reader.load_data()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the loaded documents (bare expression -> rich display).\n",
"documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split every document into chunks, keeping a parallel list that records\n",
"# which source document each chunk came from (used later to attach metadata).\n",
"text_chunks = []\n",
"doc_idxs = []\n",
"for doc_idx, doc in enumerate(documents):\n",
" chunks = text_parser.split_text(doc.text)\n",
" text_chunks += chunks\n",
" doc_idxs += [doc_idx] * len(chunks)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.schema import TextNode\n",
"\n",
"# Wrap each chunk in a TextNode and attach its source document's metadata.\n",
"nodes = []\n",
"for idx, text_chunk in enumerate(text_chunks):\n",
" node = TextNode(\n",
" text=text_chunk,\n",
" )\n",
" src_doc = documents[doc_idxs[idx]]\n",
" # Copy the metadata: assigning src_doc.metadata directly would make all\n",
" # nodes from the same document share one mutable dict.\n",
" node.metadata = dict(src_doc.metadata)\n",
" nodes.append(node)\n",
"# Show a count instead of printing every node (avoids flooding the output).\n",
"len(nodes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Local sentence-transformers embedding model; its output dimension must\n",
"# match the embed_dim=384 configured for the PGVectorStore above.\n",
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
"\n",
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Embed each node's content (text plus metadata) and store it on the node.\n",
"for node in nodes:\n",
" node_embedding = embed_model.get_text_embedding(\n",
" node.get_content(metadata_mode=\"all\")\n",
" )\n",
" node.embedding = node_embedding\n",
"# Avoid printing every full embedding vector; confirm completion with a count.\n",
"len(nodes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Embed the query string with the same model used for the documents.\n",
"query_str = \"Bash\"\n",
"query_embedding = embed_model.get_query_embedding(query_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at the query embedding (length + first few dims) instead of dumping\n",
"# the full vector into the output.\n",
"len(query_embedding), query_embedding[:5]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# construct vector store query\n",
"from llama_index.core.vector_stores import VectorStoreQuery\n",
"\n",
"query_mode = \"default\"\n",
"# query_mode = \"sparse\"\n",
"# query_mode = \"hybrid\"\n",
"\n",
"vector_store_query = VectorStoreQuery(\n",
" query_embedding=query_embedding, similarity_top_k=10, mode=query_mode\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the constructed query object.\n",
"vector_store_query"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the similarity search against pgvector.\n",
"# returns a VectorStoreQueryResult\n",
"query_result = vector_store.query(vector_store_query)\n",
"print(query_result)"
]
},
],
"metadata": {
"kernelspec": {
"display_name": "whispertest-WbvfCJ5O",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}