Discussions

Ask a Question
Back to All

I'm getting different embeddings for the SAME encoding

Here is my code

import { System } from '@iteri/back-core'

/** One embedding entry inside the `data` list of an embeddings response. */
interface VoyageEmbeddingItem {
	object: string
	/** The embedding vector itself. */
	embedding: number[]
	index: number
}

/** Token accounting block attached to an embeddings response. */
interface VoyageEmbeddingUsage {
	total_tokens: number
}

/**
 * Shape of the JSON body this file reads back from the VoyageAI
 * `/v1/embeddings` endpoint.
 */
interface VoyageEmbedding {
	object: string
	data: VoyageEmbeddingItem[]
	model: string
	usage: VoyageEmbeddingUsage
}

/** Endpoint every embedding request in this file is sent to. */
const VOYAGE_EMBEDDINGS_URL = 'https://api.voyageai.com/v1/embeddings'

/**
 * POSTs `input` to the VoyageAI embeddings endpoint using the
 * `voyage-3-large` model and returns the parsed JSON body.
 *
 * @param apiKey - Bearer token sent in the `Authorization` header.
 * @param input - A single string or a batch of strings to embed.
 * @throws Error when the HTTP status is not in the 2xx range.
 */
async function requestVoyageEmbeddings(
	apiKey: string,
	input: string | string[]
): Promise<VoyageEmbedding> {
	const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
		method: 'POST',
		headers: {
			'Content-Type': 'application/json',
			Authorization: `Bearer ${apiKey}`,
			'X-Fern-Language': 'JavaScript',
			'X-Fern-SDK-Name': 'voyageai',
			'X-Fern-SDK-Version': '0.0.4',
			'User-Agent': 'voyageai/0.0.4',
		},
		body: JSON.stringify({
			input,
			model: 'voyage-3-large',
		}),
	})

	// Fail loudly on 4xx/5xx instead of treating an error body as embeddings.
	if (!response.ok) {
		throw new Error(`VoyageAI request failed: ${response.status} ${response.statusText}`)
	}

	const result: VoyageEmbedding = await response.json()
	return result
}

/**
 * Returns the first embedding vector of a response.
 *
 * @param result - Parsed response body.
 * @param label - Short description of the request, used in the error message.
 * @throws Error when the response carries no non-empty first embedding.
 */
function firstVoyageEmbedding(result: VoyageEmbedding, label: string): number[] {
	// The body is unvalidated JSON, so `data` may be absent despite its declared type.
	const embedding = result.data?.[0]?.embedding
	if (!Array.isArray(embedding) || embedding.length === 0) {
		throw new Error(`VoyageAI ${label} response contained no embedding`)
	}
	return embedding
}

/**
 * Embeds `concept1` twice — once as the first element of the batch
 * `[concept1, concept2]` and once on its own — and reports the cosine
 * similarity between the two resulting vectors.
 *
 * The API key is read from the `VOYAGE_API_KEY` environment variable; it is
 * deliberately not stored in source.
 *
 * @param system - Currently unused; kept so existing callers keep compiling.
 * @param concept1 - Text whose two embeddings are compared.
 * @param concept2 - Second batch element; only affects the batched request.
 * @param useFetch - Currently unused; the fetch path is always taken.
 * @returns `{ similarity, distance }` with `distance = 1 - similarity`, or
 *   `null` when anything fails (the error is written to `console.error`).
 */
export async function compareEmbeddings(
	system: System,
	concept1: string,
	concept2: string,
	useFetch = false
): Promise<{ similarity: number; distance: number } | null> {
	try {
		const apiKey = process.env.VOYAGE_API_KEY
		if (!apiKey) {
			throw new Error('VOYAGE_API_KEY environment variable is not set')
		}

		// The two requests are independent, so issue them concurrently.
		const [batchResult, singleResult] = await Promise.all([
			requestVoyageEmbeddings(apiKey, [concept1, concept2]),
			requestVoyageEmbeddings(apiKey, concept1),
		])

		const embedding1 = firstVoyageEmbedding(batchResult, 'batched')
		const embedding2 = firstVoyageEmbedding(singleResult, 'single')

		// A length mismatch would silently yield NaN in the dot product below.
		if (embedding1.length !== embedding2.length) {
			throw new Error(
				`Embedding length mismatch: ${embedding1.length} vs ${embedding2.length}`
			)
		}

		// Calculate cosine similarity
		const dotProduct = embedding1.reduce((sum, a, i) => sum + a * embedding2[i], 0)
		const magnitude1 = Math.sqrt(embedding1.reduce((sum, a) => sum + a * a, 0))
		const magnitude2 = Math.sqrt(embedding2.reduce((sum, a) => sum + a * a, 0))

		// Cosine similarity is undefined for a zero vector; avoid dividing by zero.
		if (magnitude1 === 0 || magnitude2 === 0) {
			throw new Error('Cannot compute cosine similarity for a zero-magnitude embedding')
		}

		const cosineSimilarity = dotProduct / (magnitude1 * magnitude2)
		const cosineDistance = 1 - cosineSimilarity

		console.log(
			`"${concept1},${concept2}" vs raw "${concept1}": ${cosineSimilarity.toFixed(6)}`
		)

		return { similarity: cosineSimilarity, distance: cosineDistance }
	} catch (error) {
		console.error('Error:', error)
		return null
	}
}

If I run it a few times:

await compareEmbeddings(system, 'modern', 'music', true)  
await compareEmbeddings(system, 'modern', 'art', true)  
await compareEmbeddings(system, 'modern', 'intelligence', true)

Here are (a select set of) results:

"modern,music" vs raw "modern": 0.992408
"modern,art" vs raw "modern": 0.992408
"modern,intelligence" vs raw "modern": 0.992408

"modern,music" vs raw "modern": 0.992408
"modern,art" vs raw "modern": 0.992408
"modern,intelligence" vs raw "modern": 0.993093 // DIFFERENT

"modern,music" vs raw "modern": 0.992408
"modern,art" vs raw "modern": 0.993137 // DIFFERENT
"modern,intelligence" vs raw "modern": 0.992408

You CONSISTENTLY get a different embedding for the word "modern" if it is requested as part of an array of inputs

You OCCASIONALLY get a different embedding for the word "modern" regardless of the array/string differences

...

What am I misunderstanding here?