diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift
new file mode 100644
index 0000000000000000000000000000000000000000..23af62e38b6a47353ea801683200b8af9dce1eca
--- /dev/null
+++ b/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift
@@ -0,0 +1,66 @@
+//
+//  CosineSimilarityMetric.swift
+//
+//
+//  Created by Mingchung Xia on 2024-03-14.
+//
+
+import Foundation
+import Accelerate
+import SimilarityMetric
+
+// MARK: May be improved on using Surge/Nifty
+// See https://developer.apple.com/documentation/accelerate/vdsp-snv
+
+/// A `SimilarityMetric` that scores two vectors by the cosine of the angle between them.
+///
+/// Elements are widened to `Double` so the double-precision vDSP routines from
+/// Accelerate can be used; the result is converted back to `Vector.Element`.
+///
+/// NOTE(review): larger values mean "more similar" here; presumably that matches the
+/// ordering the index expects. TODO confirm against the `SimilarityMetric` contract.
+public struct CosineSimilarityMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {
+    /// Creates a cosine similarity metric.
+    public init() {}
+
+    /// Returns the cosine similarity of `someItem` and `otherItem`.
+    ///
+    /// - Parameters:
+    ///   - someItem: The first vector.
+    ///   - otherItem: The second vector; must have as many elements as `someItem`.
+    /// - Returns: `dot(a, b) / (|a| * |b|)`, or zero when the vectors are empty or
+    ///   either has zero magnitude (the quotient would otherwise be NaN).
+    /// - Precondition: Both vectors have the same number of elements.
+    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
+        // Convert vectors to Double for Accelerate functions
+        let someItemDoubles = someItem.map { Double($0) }
+        let otherItemDoubles = otherItem.map { Double($0) }
+
+        // vDSP reads `count` elements from both buffers, so a length mismatch
+        // would read past the end of the shorter one.
+        precondition(
+            someItemDoubles.count == otherItemDoubles.count,
+            "CosineSimilarityMetric requires vectors of equal length"
+        )
+        guard !someItemDoubles.isEmpty else { return .zero }
+        let count = vDSP_Length(someItemDoubles.count)
+
+        // Calculate dot product
+        var dotProduct: Double = 0.0
+        vDSP_dotprD(someItemDoubles, 1, otherItemDoubles, 1, &dotProduct, count)
+
+        // Calculate squared magnitude of each vector
+        var someItemMagnitudeSquared: Double = 0.0
+        var otherItemMagnitudeSquared: Double = 0.0
+        vDSP_svesqD(someItemDoubles, 1, &someItemMagnitudeSquared, count)
+        vDSP_svesqD(otherItemDoubles, 1, &otherItemMagnitudeSquared, count)
+        let magnitudeProduct = sqrt(someItemMagnitudeSquared) * sqrt(otherItemMagnitudeSquared)
+
+        // A zero-magnitude vector has no direction; avoid dividing by zero.
+        // (`!= 0` rather than `> 0` so NaN inputs still propagate as NaN.)
+        guard magnitudeProduct != 0 else { return .zero }
+
+        // Convert back to type Vector.Element
+        return Vector.Element(dotProduct / magnitudeProduct)
+    }
+}
diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift b/Sources/SwiftNLP/1. 
Data Collection/HNSW/DeterministicDurableVectorIndex.swift index ab2ca1e51bb2cdc98cc07806c3381941840af9f8..3aff0a31c21933c960e5dfbba6380be8dc27590f 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift @@ -41,7 +41,8 @@ extension DurableVectorIndex { public struct DeterministicDurableVectorIndex/*<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint*/ { public typealias Vector = [Double] - public typealias Index = DurableVectorIndex<CartesianDistanceMetric<Vector>, Vector.Element> +// public typealias Index = DurableVectorIndex<CartesianDistanceMetric<Vector>, Vector.Element> + public typealias Index = DurableVectorIndex<CosineSimilarityMetric<Vector>, Vector.Element> public var base: Index public var typicalNeighborhoodSize: Int public var size: Int = 0 @@ -50,7 +51,8 @@ public struct DeterministicDurableVectorIndex/*<Vector: Collection & Codable> wh // private var drng = DeterministicRandomNumberGenerator(seed: 1) public init(namespace: String, typicalNeighborhoodSize: Int = 20, in transaction: Transaction) throws { - let metric = CartesianDistanceMetric<Vector>() +// let metric = CartesianDistanceMetric<Vector>() + let metric = CosineSimilarityMetric<Vector>() let config = Config.unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize) self.base = try Index( namespace: namespace, diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift index 1c4f9f2d13d325ba2fbaa91cfa893cfaa8e7c802..aa221933cc63ed9f6dd84b18439cb74174e818c4 100644 --- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift +++ b/Tests/SwiftNLPTests/2. 
Encoding/HNSWTests.swift @@ -283,17 +283,17 @@ final class HNSWTests: XCTestCase { let transaction = try Transaction.begin(.write, in: env) /// Saving the memory map to disk -// var corpus = try DurableHNSWCorpus( -// encoder: _documentEncoder, -// namespace: "testbasicqueryexampledurable", -// in: transaction -// ) -// -// for doc in docs { -// try corpus.addUntokenizedDocument(doc, in: transaction) -// } -// -// corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") + var corpus = try DurableHNSWCorpus( + encoder: _documentEncoder, + namespace: "testbasicqueryexampledurable", + in: transaction + ) + + for doc in docs { + try corpus.addUntokenizedDocument(doc, in: transaction) + } + + corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") try transaction.commit()