Discussions
I'm getting different embeddings for the SAME encoding
1 day ago by Steve James
Here is my code
import { System } from '@iteri/back-core'
/**
 * Shape this file expects when it parses the JSON body returned by a
 * POST to the VoyageAI `/v1/embeddings` endpoint.
 */
interface VoyageEmbedding {
  object: string
  /** One entry per embedded input. */
  data: {
    object: string
    /** The embedding vector itself. */
    embedding: number[]
    // NOTE(review): presumably the position of the matching input in the
    // request's `input` array — confirm against the API reference.
    index: number
  }[]
  model: string
  usage: {
    // NOTE(review): presumably the token count billed for the request — confirm.
    total_tokens: number
  }
}
/**
 * Reads the VoyageAI API key from the `VOYAGE_API_KEY` environment variable.
 *
 * Secrets must not live in source control, so the key is looked up at call
 * time instead of being embedded as a string literal.
 *
 * @throws Error when the variable is missing or empty.
 */
function readVoyageApiKey(): string {
  // Go through globalThis so the lookup type-checks even without Node typings.
  const env = (globalThis as { process?: { env?: Record<string, string | undefined> } }).process
    ?.env
  const apiKey = env?.VOYAGE_API_KEY
  if (!apiKey) {
    throw new Error('VOYAGE_API_KEY environment variable is not set')
  }
  return apiKey
}

/**
 * Sends one embedding request to VoyageAI and returns the parsed JSON body.
 *
 * @param apiKey Bearer token placed in the `Authorization` header.
 * @param input A single string or an array of strings to embed.
 * @throws Error when the HTTP status is not 2xx, so that an API error payload
 *   is not mistaken for an embedding response further down.
 */
async function requestEmbeddings(
  apiKey: string,
  input: string | string[]
): Promise<VoyageEmbedding> {
  const response = await fetch('https://api.voyageai.com/v1/embeddings', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${apiKey}`,
      'X-Fern-Language': 'JavaScript',
      'X-Fern-SDK-Name': 'voyageai',
      'X-Fern-SDK-Version': '0.0.4',
      'User-Agent': 'voyageai/0.0.4',
    },
    body: JSON.stringify({
      input,
      model: 'voyage-3-large',
    }),
  })
  if (!response.ok) {
    throw new Error(`VoyageAI request failed: ${response.status} ${response.statusText}`)
  }
  const payload: VoyageEmbedding = await response.json()
  return payload
}

/**
 * Picks the embedding for the input at position `index` out of a response.
 *
 * Each item carries its own `index` field, so that is matched first; the
 * array position is used only as a fallback when no item declares it.
 *
 * @throws Error when the response has no usable embedding for `index`.
 */
function embeddingAt(result: VoyageEmbedding, index: number): number[] {
  const items = Array.isArray(result.data) ? result.data : []
  const item = items.find((entry) => entry.index === index) ?? items[index]
  if (!item || !Array.isArray(item.embedding)) {
    throw new Error(`VoyageAI response contains no embedding for input ${index}`)
  }
  return item.embedding
}

/**
 * Cosine similarity of two equally sized, non-zero vectors.
 *
 * @throws Error when the vectors are empty, differ in length, or either has
 *   zero magnitude (all of which would otherwise yield `NaN`).
 */
function cosineSimilarityOf(a: readonly number[], b: readonly number[]): number {
  if (a.length === 0 || a.length !== b.length) {
    throw new Error(`Embedding dimensions do not match: ${a.length} vs ${b.length}`)
  }
  let dotProduct = 0
  let squares1 = 0
  let squares2 = 0
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i]
    squares1 += a[i] * a[i]
    squares2 += b[i] * b[i]
  }
  const denominator = Math.sqrt(squares1) * Math.sqrt(squares2)
  if (denominator === 0) {
    throw new Error('Cannot compute cosine similarity of a zero-magnitude embedding')
  }
  return dotProduct / denominator
}

/**
 * Embeds `concept1` twice — once as the first element of the batch
 * `[concept1, concept2]` and once on its own — and reports how similar the
 * two resulting vectors are.
 *
 * The API key is read from the `VOYAGE_API_KEY` environment variable.
 *
 * @param system Not used by this function; kept so existing callers still compile.
 * @param concept1 Text whose batch and stand-alone embeddings are compared.
 * @param concept2 Second element of the batch request; its embedding is not compared.
 * @param useFetch Not used by this function; kept so existing callers still compile.
 * @returns The cosine similarity and cosine distance (`1 - similarity`), or
 *   `null` when anything fails (missing key, HTTP error, malformed response,
 *   unusable vectors). The failure is logged with `console.error`.
 */
export async function compareEmbeddings(
  system: System,
  concept1: string,
  concept2: string,
  useFetch = false
): Promise<{ similarity: number; distance: number } | null> {
  try {
    const apiKey = readVoyageApiKey()

    // The two requests are independent, so issue them concurrently.
    const [batchResult, singleResult] = await Promise.all([
      requestEmbeddings(apiKey, [concept1, concept2]),
      requestEmbeddings(apiKey, concept1),
    ])

    // Both vectors are embeddings of concept1: input 0 of each request.
    const embedding1 = embeddingAt(batchResult, 0)
    const embedding2 = embeddingAt(singleResult, 0)

    const cosineSimilarity = cosineSimilarityOf(embedding1, embedding2)
    const cosineDistance = 1 - cosineSimilarity
    console.log(
      `"${concept1},${concept2}" vs raw "${concept1}": ${cosineSimilarity.toFixed(6)}`
    )
    return { similarity: cosineSimilarity, distance: cosineDistance }
  } catch (error) {
    console.error('Error:', error)
    return null
  }
}
If I run it a few times:-
await compareEmbeddings(system, 'modern', 'music', true)
await compareEmbeddings(system, 'modern', 'art', true)
await compareEmbeddings(system, 'modern', 'intelligence', true)
Here are (a select set of) results:
"modern,music" vs raw "modern": 0.992408
"modern,art" vs raw "modern": 0.992408
"modern,intelligence" vs raw "modern": 0.992408
"modern,music" vs raw "modern": 0.992408
"modern,art" vs raw "modern": 0.992408
"modern,intelligence" vs raw "modern": 0.993093 // DIFFERENT
"modern,music" vs raw "modern": 0.992408
"modern,art" vs raw "modern": 0.993137 // DIFFERENT
"modern,intelligence" vs raw "modern": 0.992408
You CONSISTENTLY get a different embedding for the word "modern" if it is requested as part of an array of inputs
You OCCASIONALLY get a different embedding for the word "modern" regardless of the array/string differences
...
What am I misunderstanding here?