Is it possible to use Snowflake as a vector store?

10/17/2023

Mike Ritchie

tl;dr - Yes, it's possible, but it's really slow (at least this way of doing it is).

Snowflake as a vector store

Here are the steps to pull this off:

  1. Add an "embedding" column to your table in Snowflake with type ARRAY
  2. Use the OpenAI API to generate embeddings for each row
  3. Insert the embeddings into Snowflake
  4. Query the embeddings using a Python UDF we'll call vector_cosine_similarity_vectorized

Loading data to Snowflake

If you don't already have a dataset you want to use, this one from Kaggle is good.

Create a table in Snowflake:

-- Jeopardy questions plus one OpenAI embedding per row.
CREATE TABLE JEOPARDY_EMBEDDINGS (
    show_number INT,
    air_date STRING,
    round STRING,
    category STRING,
    value STRING,
    question STRING,
    answer STRING,
    embedding ARRAY  -- ada-002 embedding of `question`, stored as a Snowflake ARRAY of floats
);

Get embeddings for each row using the OpenAI API (we're only embedding the question column, but you could easily mash the columns together):

import requests
import json
import pandas as pd
import openai
openai.api_key = OPENAI_API_KEY

rows = jep_df.to_dict(orient='records')


def _embed_text(text):
    """Call the OpenAI embeddings endpoint for one string and return its vector."""
    resp = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=[text],
    )
    return resp["data"][0]["embedding"]


# One API call per question; the vector is attached to the row dict in place.
for row in rows:
    row['embedding'] = _embed_text(row['question'])

Insert the embeddings into Snowflake:

# Write the embedded rows out as Parquet, stage the file, and bulk-load it
# into JEOPARDY_EMBEDDINGS.
# NOTE(review): `load_to_snow` is presumably pd.DataFrame(rows) from the
# embedding step above — confirm before running.
parquet_file = 'temp_dataframe.parquet'
load_to_snow.to_parquet(parquet_file, index=False)

conn = snowflake.connector.connect(
    account=account,
    user=user,
    password=password,
    database=database,
    schema=schema,
    warehouse=warehouse,
    role=role
)

try:
    cur = conn.cursor()
    try:
        # Create or use an existing internal stage
        stage_name = 'TEMP_INTERNAL_STAGE'
        cur.execute(f"CREATE OR REPLACE STAGE {stage_name}")

        # Upload the Parquet file to the stage
        cur.execute(f"PUT file://{parquet_file} @{stage_name}")

        # Bulk load the data from the stage to a Snowflake table.
        # MATCH_BY_COLUMN_NAME is required for Parquet: without it Snowflake
        # presents each Parquet row as a single VARIANT column, so the COPY
        # cannot line up with the table's named columns.
        table_name = 'JEOPARDY_EMBEDDINGS'
        cur.execute(
            f"COPY INTO {table_name} FROM @{stage_name} "
            f"FILE_FORMAT=(TYPE='PARQUET') "
            f"MATCH_BY_COLUMN_NAME='CASE_INSENSITIVE'"
        )
    finally:
        # Close even on failure so the cursor/connection are not leaked.
        cur.close()
finally:
    conn.close()

Querying the embeddings

Create a Python UDF called vector_cosine_similarity_vectorized:

-- Vectorized Python UDF: row-wise cosine similarity between two ARRAY columns.
CREATE OR REPLACE FUNCTION vector_cosine_similarity_vectorized(A ARRAY, B ARRAY)
RETURNS FLOAT
LANGUAGE PYTHON
RUNTIME_VERSION = 3.8
PACKAGES = ('pandas', 'numpy')
HANDLER = 'vector_cosine_similarity_vectorized'
AS $$
import pandas as pd
import numpy as np
from _snowflake import vectorized

@vectorized(input=pd.DataFrame)
def vector_cosine_similarity_vectorized(df):
    """Return cosine similarity of A[i] vs B[i] for every row i of the batch.

    Bug fix: the previous implementation dotted every A against the FIRST B
    in the batch (B_series.loc[A_series.index[0]]), so all rows in a batch
    were scored against a single embedding. The dot product is now pairwise.
    """
    # Stack the per-row Python lists into (batch, dim) matrices.
    A_mat = np.stack(df.iloc[:, 0].apply(np.array).to_numpy())
    B_mat = np.stack(df.iloc[:, 1].apply(np.array).to_numpy())

    # Pairwise dot products and norms, vectorized over the whole batch.
    dot_product = np.einsum('ij,ij->i', A_mat, B_mat)
    norm_A = np.linalg.norm(A_mat, axis=1)
    norm_B = np.linalg.norm(B_mat, axis=1)

    return pd.Series(dot_product / (norm_A * norm_B), index=df.index)
$$;

Query the embeddings:

query_sentence = '''what island had a really big monkey?'''

# create an embedding for the query sentence
response = openai.Embedding.create(
    model="text-embedding-ada-002",
    input=[query_sentence]
)
query_embedding = response["data"][0]["embedding"]

conn = snowflake.connector.connect(
    account=account,
    user=user,
    password=password,
    database=database,
    schema=schema,
    warehouse=warehouse,
    role=role
)

try:
    cur = conn.cursor()
    try:
        # Inline the query vector as a literal ARRAY_CONSTRUCT(...) argument:
        # str(list) gives "[0.1, 0.2, ...]"; stripping the brackets leaves a
        # comma-separated element list. Safe to interpolate — the values are
        # floats returned by the embeddings API, not user-supplied text.
        search_vec = str(query_embedding)
        search_vec = search_vec.replace('[', '').replace(']', '')

        sql = f'''
SELECT 
    j.*,
    vector_cosine_similarity_vectorized(ARRAY_CONSTRUCT({search_vec}), j.EMBEDDING) as similarity
FROM 
    public.JEOPARDY_EMBEDDINGS as j
order by similarity desc
LIMIT 10;
'''
        cur.execute(sql)

        # Materialize the top-10 matches as a DataFrame with the cursor's
        # column names.
        rows = cur.fetchall()
        columns = [col[0] for col in cur.description]
        df = pd.DataFrame(rows, columns=columns)
    finally:
        # Close even if the query raises, so the cursor/connection don't leak.
        cur.close()
finally:
    conn.close()

df.head()

Data doesn’t need to be so hard

Get the new standard in analytics. Sign up below or get in touch and we’ll set you up in under 30 minutes.