# (abandoned) Creating RAG from scratch


Based on the LlamaIndex low-level ingestion and retrieval example:
https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval.html

### PostgreSQL with pgvector (docker-compose file)
```yaml
version: "3.6"

services:
  postgres-pgvector:
    image: ankane/pgvector:latest
    container_name: postgres-pgvector
    restart: always
    volumes:
      - postgres_pgvector_volume:/var/lib/postgresql/data
    ports:
      - 5432:5432
    environment:
      POSTGRES_USER: user
      POSTGRES_PASSWORD: password
volumes:
  postgres_pgvector_volume:
```

In [None]:
import psycopg2

# Connection settings. The user, password and port match the docker-compose
# service above; later cells reuse these names to build the vector store.
db_name = "vector_db"
host = "localhost"
password = "password"
port = "5432"
user = "user"

# Connect to the default "postgres" database rather than to `db_name`, since
# `db_name` is about to be dropped and recreated.
conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
try:
    # PostgreSQL refuses CREATE/DROP DATABASE inside a transaction block, so
    # statements must be sent in autocommit mode.
    conn.autocommit = True

    with conn.cursor() as c:
        # Start from a clean database on every run. `db_name` is a trusted
        # constant defined above, so plain interpolation is acceptable here.
        c.execute(f"DROP DATABASE IF EXISTS {db_name}")
        c.execute(f"CREATE DATABASE {db_name}")
finally:
    # The admin connection is only needed for the two statements above;
    # close it so it does not stay open for the rest of the session.
    conn.close()

In [None]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Settings for the pgvector-backed store; the connection values are the ones
# defined in the database-setup cell.
pg_store_params = dict(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="transcription",
    # Must equal the length of the stored embedding vectors. 384 is the
    # vector size of BAAI/bge-small-en-v1.5, the embedding model loaded later
    # in this notebook (it is not the OpenAI embedding dimension).
    embed_dim=384,
)

vector_store = PGVectorStore.from_params(**pg_store_params)

### Use a Text Splitter to Split Documents

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter used to cut document text into chunks; chunk_size is set to 1024
# and the separator is left at the library default.
text_parser = SentenceSplitter(chunk_size=1024)

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load every file found under ./txt_files/ into `documents`.
reader = SimpleDirectoryReader(
    input_dir="./txt_files/",
)
documents = reader.load_data()

In [None]:
# Notebook display: show the loaded documents for a quick visual check.
documents

In [None]:
# Split each document's text into chunks. `doc_idxs` runs parallel to
# `text_chunks`: entry i is the index (into `documents`) of the document that
# chunk i came from, so the source metadata can be attached to it afterwards.
text_chunks = []
doc_idxs = []
for source_idx, source_doc in enumerate(documents):
    pieces = text_parser.split_text(source_doc.text)
    text_chunks += pieces
    doc_idxs += [source_idx] * len(pieces)

In [None]:
from llama_index.core.schema import TextNode

# Wrap every text chunk in a TextNode that carries its source document's
# metadata. `text_chunks` and `doc_idxs` are parallel lists, so they are
# walked together instead of indexing one from the other.
nodes = []
for text_chunk, doc_idx in zip(text_chunks, doc_idxs):
    node = TextNode(text=text_chunk)
    # Give each node its own copy of the metadata. Assigning the document's
    # dict directly would make all nodes of one document share a single
    # object, so editing one node's metadata would silently change the others.
    node.metadata = dict(documents[doc_idx].metadata)
    print(node)
    nodes.append(node)

In [None]:
# Local embedding model loaded through the HuggingFace integration.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)


In [None]:
# Compute an embedding for every node from its text plus metadata
# (metadata_mode="all") and attach it to the node.
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding
    print(node_embedding)

# Insert the embedded nodes into the Postgres vector store. Without this step
# the table stays empty and the similarity query at the end of the notebook
# has nothing to return.
# NOTE: re-running this cell without recreating the database inserts the same
# nodes again.
vector_store.add(nodes)


In [None]:
# Embed the search text with the same model that embedded the nodes.
query_str = "Bash"
query_embedding = embed_model.get_query_embedding(
    query_str,
)

In [None]:
# Notebook display: inspect the embedding vector computed for the query.
query_embedding

In [None]:
# Build the query object that will be handed to the vector store.
from llama_index.core.vector_stores import VectorStoreQuery

# Query mode in use; the alternatives are kept for experimentation:
# query_mode = "sparse"
# query_mode = "hybrid"
query_mode = "default"

vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding,
    similarity_top_k=10,  # ask for the 10 closest matches
    mode=query_mode,
)

In [None]:
# Notebook display: inspect the constructed query object.
vector_store_query

In [None]:
# Run the query against the Postgres vector store and print what comes back
# (a VectorStoreQueryResult).
query_result = vector_store.query(
    vector_store_query,
)
print(query_result)