{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# (abandoned) Creating RAG from scratch\n",
"\n",
"\n",
"https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### PostgreSQL with pgvector docker-compose file\n",
"```yaml\n",
"version: \"3.6\"\n",
"\n",
"services:\n",
"  postgres-pgvector:\n",
"    image: ankane/pgvector:latest\n",
"    container_name: postgres-pgvector\n",
"    restart: always\n",
"    volumes:\n",
"      - postgres_pgvector_volume:/var/lib/postgresql/data\n",
"    ports:\n",
"      - 5432:5432\n",
"    environment:\n",
"      POSTGRES_USER: user\n",
"      POSTGRES_PASSWORD: password\n",
"volumes:\n",
"  postgres_pgvector_volume:\n",
"```"
]
},
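{
"cell_type": "markdown",
"metadata": {},
"source": [
"The rest of the notebook needs the Compose stack above to be running. A minimal sketch, assuming Docker Compose v2 is available and the YAML above is saved as `docker-compose.yml` next to this notebook (both are assumptions, not part of the original setup):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumption: the compose file above is saved as ./docker-compose.yml\n",
"!docker compose up -d\n",
"\n",
"# Quick check that the container is up\n",
"!docker ps --filter name=postgres-pgvector"
]
},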
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import psycopg2\n",
"\n",
"db_name = \"vector_db\"\n",
"host = \"localhost\"\n",
"password = \"password\"\n",
"port = \"5432\"\n",
"user = \"user\"\n",
"# Connect to the default 'postgres' database so the vector database can be (re)created\n",
"# conn = psycopg2.connect(connection_string)\n",
"conn = psycopg2.connect(\n",
"    dbname='postgres',\n",
"    host=host,\n",
"    password=password,\n",
"    port=port,\n",
"    user=user,\n",
")\n",
"conn.autocommit = True\n",
"\n",
"with conn.cursor() as c:\n",
"    c.execute(f\"DROP DATABASE IF EXISTS {db_name}\")\n",
"    c.execute(f\"CREATE DATABASE {db_name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sqlalchemy import make_url\n",
"from llama_index.vector_stores.postgres import PGVectorStore\n",
"\n",
"vector_store = PGVectorStore.from_params(\n",
"    database=db_name,\n",
"    host=host,\n",
"    password=password,\n",
"    port=port,\n",
"    user=user,\n",
"    table_name=\"transcription\",\n",
"    embed_dim=384,  # bge-small-en-v1.5 embedding dimension\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Use a Text Splitter to Split Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.node_parser import SentenceSplitter\n",
"\n",
"text_parser = SentenceSplitter(\n",
"    chunk_size=1024,\n",
"    # separator=\" \",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"\n",
"reader = SimpleDirectoryReader(input_dir=\"./txt_files/\")\n",
"documents = reader.load_data()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text_chunks = []\n",
"# maintain relationship with source doc index, to help inject doc metadata into the nodes below\n",
"doc_idxs = []\n",
"for doc_idx, doc in enumerate(documents):\n",
"    cur_text_chunks = text_parser.split_text(doc.text)\n",
"    text_chunks.extend(cur_text_chunks)\n",
"    doc_idxs.extend([doc_idx] * len(cur_text_chunks))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.schema import TextNode\n",
"\n",
"nodes = []\n",
"for idx, text_chunk in enumerate(text_chunks):\n",
"    node = TextNode(\n",
"        text=text_chunk,\n",
"    )\n",
"    src_doc = documents[doc_idxs[idx]]\n",
"    node.metadata = src_doc.metadata\n",
"    print(node)\n",
"    nodes.append(node)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sentence transformers\n",
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
"\n",
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for node in nodes:\n",
"    node_embedding = embed_model.get_text_embedding(\n",
"        node.get_content(metadata_mode=\"all\")\n",
"    )\n",
"    node.embedding = node_embedding\n",
"    print(node_embedding)\n"
]
},
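{
"cell_type": "markdown",
"metadata": {},
"source": [
"The embedded nodes still have to be written to Postgres before anything can be retrieved; without this step the query below returns nothing. A minimal sketch of the missing ingestion step, using the `vector_store.add(...)` call from the linked LlamaIndex example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the embedded nodes into the pgvector-backed table\n",
"inserted_ids = vector_store.add(nodes)\n",
"print(f\"inserted {len(inserted_ids)} nodes\")"
]
},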
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query_str = \"Bash\"\n",
"query_embedding = embed_model.get_query_embedding(query_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query_embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# construct vector store query\n",
"from llama_index.core.vector_stores import VectorStoreQuery\n",
"\n",
"query_mode = \"default\"\n",
"# query_mode = \"sparse\"\n",
"# query_mode = \"hybrid\"\n",
"\n",
"vector_store_query = VectorStoreQuery(\n",
"    query_embedding=query_embedding, similarity_top_k=10, mode=query_mode\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vector_store_query"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# returns a VectorStoreQueryResult\n",
"query_result = vector_store.query(vector_store_query)\n",
"print(query_result)"
]
},
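{
"cell_type": "markdown",
"metadata": {},
"source": [
"The raw `VectorStoreQueryResult` is easier to read when unpacked into texts and scores. A minimal sketch, assuming the result exposes `nodes` and `similarities` as in the linked LlamaIndex example, and that `SimpleDirectoryReader` populated a `file_name` metadata field:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the retrieved chunks with their similarity scores and source files\n",
"for node, score in zip(query_result.nodes, query_result.similarities):\n",
"    print(f\"score={score:.3f}  source={node.metadata.get('file_name')}\")\n",
"    print(node.get_content()[:200])\n",
"    print(\"---\")"
]
}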
],
"metadata": {
"kernelspec": {
"display_name": "whispertest-WbvfCJ5O",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}