# URL redirect mapping: match each old URL to its closest new URL (embeddings first, fuzzy fallback).
import pandas as pd
import ollama
from rapidfuzz import fuzz
import logging
# -------------------------------
# Fine-tuning parameters (both are compared against scores on a 0-1 scale)
EMBEDDING_THRESHOLD = 0.7  # Minimum cosine similarity for an embedding match; raise for stricter matching
FUZZY_THRESHOLD = 0.6  # Minimum fuzzy ratio (scaled to 0-1) for a fuzzy match; raise for stricter matching
# -------------------------------
# Set up file logging. Console output is produced separately by log_message().
# The file handler is created explicitly with UTF-8 so that log lines containing
# non-ASCII characters (URLs, the arrow in progress messages) are written
# correctly even where the platform default encoding is a legacy code page.
log_file = "mapping_log.txt"
logging.basicConfig(
    handlers=[logging.FileHandler(log_file, encoding="utf-8")],
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
)
def log_message(message) -> None:
    """Report *message* on the console and record it in the log.

    Args:
        message: Text to emit. It is printed to standard output immediately
            and also passed to the root logger at INFO level.
    """
    print(message)  # Real-time feedback in the terminal
    logging.info(message)  # Persisted by whatever handlers the root logger has
def generate_embedding(text, model="llama3"):
    """Generate an embedding vector for *text* using an Ollama model.

    Args:
        text: The string to embed (here, a URL).
        model: Name of the Ollama model to query. Defaults to "llama3".

    Returns:
        The value of the response's "embedding" field, or None when the
        request fails for any reason. Failures are logged, not raised, so a
        single bad URL does not abort the whole mapping run.
    """
    try:
        response = ollama.embeddings(model=model, prompt=text)
    except Exception as e:
        # Best-effort: callers treat None as "no embedding available".
        log_message(f"[ERROR] Failed to generate embedding for '{text}': {e}")
        return None
    try:
        return response["embedding"]
    except Exception as e:
        # A malformed response is handled the same way as a failed request.
        log_message(f"[ERROR] Failed to generate embedding for '{text}': {e}")
        return None
def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two embedding vectors.

    Args:
        vec1: Sequence of numbers, or None when no embedding is available.
        vec2: Sequence of numbers, or None when no embedding is available.

    Returns:
        The cosine of the angle between the two vectors as a float.
        Returns 0.0 when either vector is None, when the vectors have
        different lengths, or when either has zero magnitude (which includes
        empty vectors).
    """
    if vec1 is None or vec2 is None:
        return 0.0
    # zip() would silently truncate to the shorter vector and produce a
    # meaningless score, so vectors of different dimensionality never match.
    if len(vec1) != len(vec2):
        return 0.0
    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = sum(a ** 2 for a in vec1) ** 0.5
    magnitude2 = sum(b ** 2 for b in vec2) ** 0.5
    # Guard against division by zero for all-zero or empty vectors.
    if not magnitude1 or not magnitude2:
        return 0.0
    return dot_product / (magnitude1 * magnitude2)
# Load CSV files. names=["URL"] supplies the column label, so the first line
# of each file is read as data rather than as a header.
old_df = pd.read_csv("old.csv", names=["URL"])
new_df = pd.read_csv("new.csv", names=["URL"])

# Candidate targets: drop missing values and duplicates up front so each
# distinct new URL is embedded exactly once (first-seen order is kept).
new_urls = new_df["URL"].dropna().unique()

# Generate embeddings for all new URLs (cache for efficiency)
log_message("Generating embeddings for new URLs...")
new_embeddings = {new_url: generate_embedding(new_url) for new_url in new_urls}
log_message("Embeddings for new URLs completed.")

# Create output list
output_data = []
total_old = len(old_df)
log_message("Starting URL mapping...")

# enumerate() gives a 1-based progress counter that does not depend on the
# DataFrame's index labels being integers.
for position, old_url in enumerate(old_df["URL"], start=1):
    old_embedding = generate_embedding(old_url)
    best_match = None
    best_score = 0
    match_method = "No close match"

    # Compare against new URLs using embeddings. Without an embedding for the
    # old URL every similarity would be 0, so the comparison is skipped.
    if old_embedding is not None:
        for new_url, new_embedding in new_embeddings.items():
            similarity = cosine_similarity(old_embedding, new_embedding)
            if similarity > best_score and similarity >= EMBEDDING_THRESHOLD:
                best_match = new_url
                best_score = similarity
                match_method = "Embeddings"

    # If no good embedding match, use fuzzy matching
    if best_score < EMBEDDING_THRESHOLD:
        for new_url in new_urls:
            # fuzz.ratio returns 0-100; scale to 0-1 to match the thresholds.
            fuzzy_score = fuzz.ratio(old_url, new_url) / 100
            if fuzzy_score > best_score and fuzzy_score >= FUZZY_THRESHOLD:
                best_match = new_url
                best_score = fuzzy_score
                match_method = "Fuzzy"

    # If no match, return "No close match"
    if not best_match:
        best_match = "No close match"
        best_score = 0

    log_message(f"Processed {position}/{total_old}: {old_url} → {best_match} (Score: {best_score:.2f}, Method: {match_method})")

    # Store result
    output_data.append([old_url, best_match, best_score, match_method])

# Save results to CSV
output_df = pd.DataFrame(output_data, columns=["Old URL", "New URL", "Match Score", "Match Method"])
output_df.to_csv("redirect_mapping.csv", index=False)
log_message("Processing complete. Check 'redirect_mapping.csv' for results.")