POC Corporate Knowledge Base with Hugging Face and FAISS¶
A searchable corporate knowledge base built with Hugging Face transformers and an integrated FAISS (Facebook AI Similarity Search) vector database. FAISS enables efficient search through large volumes of documents by transforming them into vector representations, a method particularly useful for retrieving relevant documents based on semantic similarity.¶
The illustrative design patterns showcased in this POC were modified and extended to successfully serve a real business use case.¶
ARCHITECT, AWARE-BUSINESS SYSTEMS: Noelle Milton Vega¶
In [1]:
# !pip install transformers datasets faiss-cpu sentence-transformers
In [ ]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from datasets import Dataset
import torch, faiss, numpy as np, pandas as pd
In [3]:
# ---------------------------------------------------------
# Sample documents representing corporate knowledge.
# ---------------------------------------------------------
documents = [
{"title": "Company Vision", "content": "Our vision is to be the leading provider of innovative solutions in the tech industry."},
{"title": "HR Policies", "content": "Employees are entitled to 15 days of paid leave per year. Health benefits include medical, dental, and vision coverage."},
{"title": "Tech Stack", "content": "We use Python, Django, and React for our web applications. Our data infrastructure is built on PostgreSQL and AWS."},
{"title": "Project Management", "content": "Our project management follows Agile principles with two-week sprints and regular retrospectives."},
]
# ---------------------------------------------------------
# Convert the documents to a Pandas DataFrame. This will be
# used to load the data into a Hugging Face Dataset(), which
# works seamlessly with the Hugging Face Transformers library.
# ---------------------------------------------------------
df = pd.DataFrame(documents)
# ---------------------------------------------------------
df
Out[3]:
| | title | content |
|---|---|---|
| 0 | Company Vision | Our vision is to be the leading provider of in... |
| 1 | HR Policies | Employees are entitled to 15 days of paid leav... |
| 2 | Tech Stack | We use Python, Django, and React for our web a... |
| 3 | Project Management | Our project management follows Agile principle... |
In [4]:
# ----------------------------------------------------------------
# Load the documents into a HuggingFace Dataset.
# ----------------------------------------------------------------
dataset = Dataset.from_pandas(df)
# ----------------------------------------------------------------
print(dataset)
print(dataset[0])
Dataset({
    features: ['title', 'content'],
    num_rows: 4
})
{'title': 'Company Vision', 'content': 'Our vision is to be the leading provider of innovative solutions in the tech industry.'}
In [5]:
# --------------------------------------------------------------
# Note: see https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2
# The SentenceTransformer() wrapper may offer a simpler way to
# implement this cell's embedding logic (see the sketch after this cell).
# --------------------------------------------------------------
# --------------------------------------------------------------
# Load a pre-trained transformer model and tokenizer.
# --------------------------------------------------------------
model_name = "sentence-transformers/all-MiniLM-L12-v2" # https://huggingface.co/spaces/mteb/leaderboard
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)  # If False, a warning prompt may appear.
# --------------------------------------------------------------
# --------------------------------------------------------------
# Function to generate embeddings.
# --------------------------------------------------------------
def embed_text(text):
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
embeddings = model(**inputs).last_hidden_state.mean(dim=1)
return embeddings[0].numpy()
# --------------------------------------------------------------
# --------------------------------------------------------------
# Generate embeddings for all documents
# --------------------------------------------------------------
embeddings = np.array([embed_text(doc['content']) for doc in dataset])
# --------------------------------------------------------------
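As the note above suggests, the manual tokenize-and-pool pipeline in this cell could likely be replaced by the SentenceTransformer wrapper already imported. A minimal sketch, not executed here; its pooling details differ slightly from the manual mean-pooling above, so scores may not be identical:
In [ ]:
# --------------------------------------------------------------
# Alternative sketch: SentenceTransformer bundles tokenization,
# pooling, and batching behind a single encode() call.
# --------------------------------------------------------------
st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
st_embeddings = st_model.encode([doc["content"] for doc in dataset])  # (num_docs, 384) float32 numpy array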
In [6]:
# ---------------------------------------------------------
# Initialize the FAISS index.
# ---------------------------------------------------------
dimension = embeddings.shape[1] # embeddings.shape: [4, 384]
index = faiss.IndexFlatL2(dimension)
# ---------------------------------------------------------
# Add embeddings to the index
# ---------------------------------------------------------
index.add(embeddings)
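IndexFlatL2 ranks documents by Euclidean distance. If cosine similarity is preferred for sentence embeddings, one option is to L2-normalize the vectors and use an inner-product index. The sketch below is illustrative only; the rest of the notebook keeps the IndexFlatL2 index built above.
In [ ]:
# ---------------------------------------------------------
# Optional variant: cosine similarity via a normalized
# inner-product index (illustrative only).
# ---------------------------------------------------------
cosine_embeddings = embeddings.astype(np.float32)  # astype copies, so the original embeddings stay unmodified
faiss.normalize_L2(cosine_embeddings)              # in-place L2 normalization
cosine_index = faiss.IndexFlatIP(dimension)        # inner product == cosine similarity on unit vectors
cosine_index.add(cosine_embeddings)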
In [7]:
def search_faiss(query, index, dataset):
query_embedding = embed_text(query).reshape(1, -1) # Embed query.
distances, indices = index.search(query_embedding, k=3) # Search index & retrieve top-3 matches.
    # indices[0][0] is a numpy.int64; .item() converts it to a plain Python int for dataset indexing.
    best_match = dataset[indices[0][0].item()]              # Get the best matching document.
return best_match
# ---------------------------------------------------------------------
# Example query.
# ---------------------------------------------------------------------
question = "What is our company vision?"
result = search_faiss(question, index, dataset)
print(f"Question: {question}")
print(f"Best Match: {result['content']}")
Question: What is our company vision?
Best Match: Our vision is to be the leading provider of innovative solutions in the tech industry.
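search_faiss retrieves the top-3 neighbours but returns only the best one. The small extension below (the helper name search_faiss_topk is ours, not part of the original code) surfaces all k matches with their titles and distances, which is often more useful when presenting results to employees:
In [ ]:
def search_faiss_topk(query, index, dataset, k=3):
    query_embedding = embed_text(query).reshape(1, -1)     # Embed the query.
    distances, indices = index.search(query_embedding, k)  # Retrieve top-k neighbours.
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        doc = dataset[idx.item()]
        results.append({"title": doc["title"], "content": doc["content"], "distance": float(dist)})
    return results

for hit in search_faiss_topk("Which frameworks do we use for web applications?", index, dataset):
    print(f"{hit['distance']:.3f}  {hit['title']}")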
In [8]:
# -------------------------------------------------------------
# Adding more documents dynamically.
# -------------------------------------------------------------
new_documents = [
{"title": "Customer Support", "content": "Customer support is available 24/7 via chat, email, and phone. Our support team is trained to handle a wide range of issues."},
{"title": "Product Roadmap", "content": "Our product roadmap includes the development of AI-driven features and expansion into new markets."},
]
# -------------------------------------------------------------
# Convert to DataFrame and generate embeddings.
# -------------------------------------------------------------
new_df = pd.DataFrame(new_documents)
new_dataset = Dataset.from_pandas(new_df)
new_embeddings = np.array([embed_text(doc['content']) for doc in new_dataset])
# -------------------------------------------------------------
# Add new embeddings to the FAISS index
# -------------------------------------------------------------
index.add(new_embeddings)
# -------------------------------------------------------------
# Update the dataset with new documents
# -------------------------------------------------------------
dataset = Dataset.from_pandas(pd.concat([df, new_df], ignore_index=True))  # ignore_index keeps a clean 0..n index so Dataset rows line up with FAISS ids.
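A quick sanity check (the query text below is illustrative) that the newly indexed documents are retrievable through the same search path:
In [ ]:
# -------------------------------------------------------------
# Verify that a newly added document can be retrieved.
# -------------------------------------------------------------
question = "How can customers reach support?"
result = search_faiss(question, index, dataset)
print(f"Question: {question}")
print(f"Best Match: {result['content']}")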
Conclusion and Next Steps¶
This notebook provided an extended implementation of a corporate knowledge base using Hugging Face and FAISS for vector search. With FAISS, you can scale your knowledge base to handle thousands of documents efficiently.¶
Suggested Next Steps:¶
- Experiment with different transformer models for embedding to improve search accuracy.
- Integrate this system with a web interface to allow employees to query the knowledge base easily.
- Explore using FAISS with more advanced indexing techniques, such as Hierarchical Navigable Small World (HNSW) graphs, for even faster searches (see the sketch below).
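As a starting point for the last suggestion, a minimal HNSW sketch; the parameter values (32 graph neighbours, efSearch of 64) are illustrative rather than tuned:
In [ ]:
# -------------------------------------------------------------
# Sketch: HNSW index as a drop-in replacement for IndexFlatL2.
# -------------------------------------------------------------
hnsw_index = faiss.IndexHNSWFlat(dimension, 32)   # 32 = graph connectivity (M)
hnsw_index.hnsw.efSearch = 64                     # higher = more accurate but slower queries
hnsw_index.add(embeddings)                        # the float32 embeddings built earlier
distances, indices = hnsw_index.search(embed_text("What is our company vision?").reshape(1, -1), 3)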
In [ ]: