Cookie Consent by Free Privacy Policy Generator

Embeddings for Redirect Mapping using Ollama (Llama3 embeddings) with fuzzy matching

Code:
import pandas as pd
import ollama
from rapidfuzz import fuzz
import logging

# -------------------------------
# Fine-tuning parameters
# A candidate URL must reach the relevant threshold to be accepted as a match.
EMBEDDING_THRESHOLD = 0.7  # Adjust for stricter or looser embedding matches
FUZZY_THRESHOLD = 0.6  # Adjust for stricter or looser fuzzy matches
# -------------------------------

# Set up file logging (console output is produced separately via print()).
log_file = "mapping_log.txt"
# Open the log file explicitly as UTF-8. Progress messages contain non-ASCII
# characters (e.g. "→"); with the platform default encoding on Windows
# (typically cp1252) those records cannot be encoded and fail to be written.
# An explicit FileHandler is used instead of basicConfig(encoding=...) so the
# script also works on Python versions older than 3.9.
logging.basicConfig(
    handlers=[logging.FileHandler(log_file, encoding="utf-8")],
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
)

def log_message(message):
    """Echo *message* to stdout and record it at INFO level on the root logger."""
    # Console first, so progress is visible live in the terminal.
    print(message)
    # Then hand the same text to the logging module.
    logging.log(logging.INFO, message)

def generate_embedding(text, model="llama3"):
    """Generate an embedding for *text* using an Ollama model.

    Args:
        text: The string to embed.
        model: Name of the Ollama model to query. Defaults to "llama3",
            so existing single-argument calls behave exactly as before.

    Returns:
        The value stored under the "embedding" key of the Ollama response,
        or None if the request (or the key lookup) raised an exception.
    """
    try:
        response = ollama.embeddings(model=model, prompt=text)
        return response["embedding"]
    except Exception as e:
        # Deliberately best-effort: the failure is reported and None is
        # returned instead of propagating the error to the caller.
        log_message(f"[ERROR] Failed to generate embedding for '{text}': {e}")
        return None

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two embedding vectors.

    The result is 0 when either vector is None or has zero magnitude.
    """
    # A missing embedding can never be similar to anything.
    if vec1 is None:
        return 0
    if vec2 is None:
        return 0

    dot = sum(x * y for x, y in zip(vec1, vec2))
    norm1 = sum(x**2 for x in vec1) ** 0.5
    norm2 = sum(y**2 for y in vec2) ** 0.5

    # Guard against division by zero for all-zero (or empty) vectors.
    if not (norm1 and norm2):
        return 0
    return dot / (norm1 * norm2)

# Load CSV files. names=["URL"] labels the single column "URL"; no header row
# is consumed, so the first line of each file is treated as data.
old_df = pd.read_csv("old.csv", names=["URL"])
new_df = pd.read_csv("new.csv", names=["URL"])

# Generate embeddings for all new URLs once, up front (cache for efficiency).
# The column is iterated directly rather than via iterrows(), which would
# build a throwaway Series for every row just to read one value.
log_message("Generating embeddings for new URLs...")
new_embeddings = {url: generate_embedding(url) for url in new_df["URL"]}
log_message("Embeddings for new URLs completed.")

# Create output list
output_data = []
total_old = len(old_df)

log_message("Starting URL mapping...")
# enumerate() supplies the 1-based progress counter, so the progress message
# does not depend on the DataFrame's index labels.
for position, old_url in enumerate(old_df["URL"], start=1):
    old_embedding = generate_embedding(old_url)

    best_match = None
    best_score = 0
    match_method = "No close match"

    # Compare against new URLs using embeddings. When the old URL's embedding
    # failed (None), every similarity would be 0 and could never beat
    # best_score, so the comparison loop is skipped entirely.
    if old_embedding is not None:
        for new_url, new_embedding in new_embeddings.items():
            similarity = cosine_similarity(old_embedding, new_embedding)
            if similarity > best_score and similarity >= EMBEDDING_THRESHOLD:
                best_match = new_url
                best_score = similarity
                match_method = "Embeddings"

    # If no good embedding match, fall back to fuzzy string matching.
    # fuzz.ratio returns 0-100, so it is scaled to 0-1 to match the threshold.
    if best_score < EMBEDDING_THRESHOLD:
        for new_url in new_df["URL"]:
            fuzzy_score = fuzz.ratio(old_url, new_url) / 100
            if fuzzy_score > best_score and fuzzy_score >= FUZZY_THRESHOLD:
                best_match = new_url
                best_score = fuzzy_score
                match_method = "Fuzzy"

    # If no match, return "No close match"
    if not best_match:
        best_match = "No close match"
        best_score = 0

    log_message(f"Processed {position}/{total_old}: {old_url} → {best_match} (Score: {best_score:.2f}, Method: {match_method})")

    # Store result
    output_data.append([old_url, best_match, best_score, match_method])

# Save results to CSV
output_df = pd.DataFrame(output_data, columns=["Old URL", "New URL", "Match Score", "Match Method"])
output_df.to_csv("redirect_mapping.csv", index=False)

log_message("Processing complete. Check 'redirect_mapping.csv' for results.")
 
Back
Top