From 2330a8cd7955cacf4aa9b87178dbe9ea6657ee1e Mon Sep 17 00:00:00 2001
From: Mingchung Xia <mingchung.xia@gmail.com>
Date: Thu, 14 Mar 2024 15:55:35 -0400
Subject: [PATCH] Added cosine similarity

---
Review notes (ignored by `git am`):
- CosineSimilarityMetric.swift was revised by hand in this patch (equal-length
  precondition, zero-magnitude guard, doc comments), so the blob hash on its
  `index` line no longer matches the post-image.
- Context-line indentation was reconstructed; if a hunk is rejected, retry
  with `git apply --ignore-whitespace`.

 .../HNSW/CosineSimilarityMetric.swift         | 61 +++++++++++++++++++
 .../DeterministicDurableVectorIndex.swift     |  6 +-
 .../SwiftNLPTests/2. Encoding/HNSWTests.swift | 22 +++----
 3 files changed, 76 insertions(+), 13 deletions(-)
 create mode 100644 Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift

diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift
new file mode 100644
index 00000000..23af62e3
--- /dev/null
+++ b/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift
@@ -0,0 +1,61 @@
+//
+//  CosineSimilarityMetric.swift
+//
+//
+//  Created by Mingchung Xia on 2024-03-14.
+//
+
+import Foundation
+import Accelerate
+import SimilarityMetric
+
+// MARK: May be improved on using Surge/Nifty
+// See https://developer.apple.com/documentation/accelerate/vdsp-snv
+
+/// Scores two vectors by the cosine of the angle between them.
+///
+/// Elements are widened to `Double` so the double-precision vDSP routines
+/// from Accelerate can be used for any `BinaryFloatingPoint` element type.
+public struct CosineSimilarityMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {
+    /// Returns the cosine similarity of `someItem` and `otherItem`.
+    ///
+    /// - Parameters:
+    ///   - someItem: The first vector.
+    ///   - otherItem: The second vector; must have as many elements as `someItem`.
+    /// - Returns: `dot(a, b) / (|a| * |b|)`, or `0` when that denominator is zero
+    ///   (an empty or all-zero vector), where the cosine is undefined.
+    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
+        /// Convert vectors to Double for Accelerate functions
+        let someItemDoubles = someItem.map { Double($0) }
+        let otherItemDoubles = otherItem.map { Double($0) }
+
+        /// vDSP reads `count` elements from both buffers, so a shorter
+        /// `otherItem` would be read past its end.
+        precondition(
+            someItemDoubles.count == otherItemDoubles.count,
+            "Cosine similarity requires vectors of equal dimension"
+        )
+        let count = vDSP_Length(someItemDoubles.count)
+
+        /// Calculate dot product
+        var dotProduct: Double = 0.0
+        vDSP_dotprD(someItemDoubles, 1, otherItemDoubles, 1, &dotProduct, count)
+
+        /// Calculate squared magnitude of vectors
+        var someItemMagnitudeSquared: Double = 0.0
+        var otherItemMagnitudeSquared: Double = 0.0
+        vDSP_svesqD(someItemDoubles, 1, &someItemMagnitudeSquared, count)
+        vDSP_svesqD(otherItemDoubles, 1, &otherItemMagnitudeSquared, count)
+
+        /// A zero denominator would make the division below yield NaN,
+        /// which cannot be ordered against other similarity scores.
+        let magnitudeProduct = sqrt(someItemMagnitudeSquared) * sqrt(otherItemMagnitudeSquared)
+        guard !magnitudeProduct.isZero else { return 0 }
+
+        /// Calculate the cosine similarity
+        let cosineSimilarity = dotProduct / magnitudeProduct
+
+        /// Convert back to type Vector.Element
+        return Vector.Element(cosineSimilarity)
+    }
+}
diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift
index ab2ca1e5..3aff0a31 100644
--- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift
+++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift
@@ -41,7 +41,8 @@ extension DurableVectorIndex {
 
 public struct DeterministicDurableVectorIndex/*<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint*/ {
     public typealias Vector = [Double]
-    public typealias Index = DurableVectorIndex<CartesianDistanceMetric<Vector>, Vector.Element>
+//    public typealias Index = DurableVectorIndex<CartesianDistanceMetric<Vector>, Vector.Element>
+    public typealias Index = DurableVectorIndex<CosineSimilarityMetric<Vector>, Vector.Element>
     public var base: Index
     public var typicalNeighborhoodSize: Int
     public var size: Int = 0
@@ -50,7 +51,8 @@ public struct DeterministicDurableVectorIndex/*<Vector: Collection & Codable> wh
 //    private var drng = DeterministicRandomNumberGenerator(seed: 1)
 
     public init(namespace: String, typicalNeighborhoodSize: Int = 20, in transaction: Transaction) throws {
-        let metric = CartesianDistanceMetric<Vector>()
+//        let metric = CartesianDistanceMetric<Vector>()
+        let metric = CosineSimilarityMetric<Vector>()
         let config = Config.unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize)
         self.base = try Index(
             namespace: namespace,
diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
index 1c4f9f2d..aa221933 100644
--- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
+++ b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
@@ -283,17 +283,17 @@ final class HNSWTests: XCTestCase {
         let transaction = try Transaction.begin(.write, in: env)
 
         /// Saving the memory map to disk
-//        var corpus = try DurableHNSWCorpus(
-//            encoder: _documentEncoder,
-//            namespace: "testbasicqueryexampledurable",
-//            in: transaction
-//        )
-//
-//        for doc in docs {
-//            try corpus.addUntokenizedDocument(doc, in: transaction)
-//        }
-//
-//        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
+        var corpus = try DurableHNSWCorpus(
+            encoder: _documentEncoder,
+            namespace: "testbasicqueryexampledurable",
+            in: transaction
+        )
+
+        for doc in docs {
+            try corpus.addUntokenizedDocument(doc, in: transaction)
+        }
+
+        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
 
         try transaction.commit()
 
-- 
GitLab