diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/CartesianDistanceMetric.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/CartesianDistanceMetric.swift index 78a500624aa796eeed5983d058d85337a9cc70ac..3b2b7dc6b648b6f1fac6afb7a73aa9ed50fdc440 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/CartesianDistanceMetric.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/CartesianDistanceMetric.swift @@ -30,12 +30,6 @@ import Surge public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint { public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element { -// -// let squaredDifferences = zip(someItem, otherItem).map { (x, y) in (x - y) * (x - y) } -// let squaredSum = squaredDifferences.reduce(0, +) -// -// return sqrt(squaredSum) -// return Vector.Element(Surge.distSq(someItem as! [Double], otherItem as! [Double])) } } diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift index 31b7b4d0d99a475ff5c7e11a50755ade807ccc3b..54d10b68ea0c1424d38a454d58241bb9fc6044f8 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/CosineSimilarityMetric.swift @@ -15,28 +15,6 @@ import Surge public struct CosineSimilarityMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint { public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element { -// /// Convert vectors to Double for Accelerate functions -// let someItemDoubles = someItem.map { Double($0) } -// let otherItemDoubles = otherItem.map { Double($0) } -// -// /// Calculate dot product -// var dotProduct: Double = 0.0 -// vDSP_dotprD(someItemDoubles, 1, otherItemDoubles, 1, &dotProduct, vDSP_Length(someItemDoubles.count)) -// -// /// Calculate magnitude of vectors -// var someItemMagnitudeSquared: Double = 0.0 -// var otherItemMagnitudeSquared: Double = 0.0 -// vDSP_svesqD(someItemDoubles, 1, &someItemMagnitudeSquared, vDSP_Length(someItemDoubles.count)) -// vDSP_svesqD(otherItemDoubles, 1, &otherItemMagnitudeSquared, vDSP_Length(otherItemDoubles.count)) -// let someItemMagnitude = sqrt(someItemMagnitudeSquared) -// let otherItemMagnitude = sqrt(otherItemMagnitudeSquared) -// -// /// Calculate the cosine similarity -// let cosineSimilarity = dotProduct / (someItemMagnitude * otherItemMagnitude) -// -// /// Convert back to type Vector.Element -// return Vector.Element(cosineSimilarity) - // Convert vectors to arrays of Double let someItemDoubles = someItem.map { Double($0) } let otherItemDoubles = otherItem.map { Double($0) } diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift index 057afd45281b066beafc6ef7a86dceaab5497de7..39bb795e7219a0b5365344d057955022e6578db2 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift @@ -46,8 +46,7 @@ public struct DeterministicDurableVectorIndex<VectorComponent: UnsafeMemoryLayou public var typicalNeighborhoodSize: Int public var size: Int = 0 // TODO: This size is not set when read from LMDB - private var srng = SeedableRandomNumberGenerator(seed: 1) - // private var drng = DeterministicRandomNumberGenerator(seed: 1) + private var rng: RandomNumberGenerator public init(namespace: String, typicalNeighborhoodSize: Int = 20, in transaction: Transaction) throws { let metric = CartesianDistanceMetric<Vector>() @@ -59,6 +58,7 @@ public struct DeterministicDurableVectorIndex<VectorComponent: UnsafeMemoryLayou in: transaction ) self.typicalNeighborhoodSize = typicalNeighborhoodSize + self.rng = SeedableRNG(seed: 1) } public func find(near query: Vector, limit: Int, exact: Bool = false, in transaction: Transaction) throws -> [Index.Neighbor] { @@ -76,7 +76,7 @@ public struct DeterministicDurableVectorIndex<VectorComponent: UnsafeMemoryLayou defer { size += 1 } let accessor = try Index.Accessor(for: base, in: transaction) let key = String(size) - accessor.insert(vector, forKey: key, using: &srng) + accessor.insert(vector, forKey: key, using: &rng) return self.size } } diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex.swift index 1029b5846c14a2b171415c02bfd1e49e960f0164..8b36bb690b822f6a945afd9b0005bbc19682b1c2 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex.swift @@ -38,14 +38,14 @@ public struct DeterministicEphemeralVectorIndex<Vector: Collection & Codable> wh public var base: Index public var typicalNeighborhoodSize: Int + private var rng: RandomNumberGenerator + public init(typicalNeighborhoodSize: Int = 20) { base = .init(metric: CartesianDistanceMetric<Vector>(), config: .unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize)) self.typicalNeighborhoodSize = typicalNeighborhoodSize + self.rng = SeedableRNG(seed: 1) } - private var srng = SeedableRandomNumberGenerator(seed: 1) - // private var drng = DeterministicRandomNumberGenerator(seed: 1) - public func find(near query: Vector, limit: Int, exact: Bool = false) throws -> [Index.Neighbor] { if exact { return Array(PriorityHeap(base.vectors.enumerated().map { @@ -62,8 +62,7 @@ public struct DeterministicEphemeralVectorIndex<Vector: Collection & Codable> wh let convertedVector: [Double] = vector.map{ Double($0) } if let metricVector = convertedVector as? CartesianDistanceMetric<Vector>.Vector { /// base.insert will returns a key and inserts the vector into the index - let key = base.insert(metricVector, using: &srng) - // let key = base.insert(metricVector, using: &drng) + let key = base.insert(metricVector, using: &rng) return key } else { fatalError("Unable to get metric vector") diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicRandomNumberGenerator.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/MersenneTwisterRNG.swift similarity index 96% rename from Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicRandomNumberGenerator.swift rename to Sources/SwiftNLP/1. Data Collection/HNSW/MersenneTwisterRNG.swift index 0da41b32d33e87f1bab8d8f3363c5c3f8e82395e..f96caec7a389eb94ee226562dd4dddca663e89d2 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicRandomNumberGenerator.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/MersenneTwisterRNG.swift @@ -32,7 +32,7 @@ import GameplayKit // See https://github.com/quells/Squall package for alternative mersenne twister @available(macOS, introduced: 10.11) -struct DeterministicRandomNumberGenerator: RandomNumberGenerator { +struct MersenneTwisterRNG: RandomNumberGenerator { private let randomSource: GKMersenneTwisterRandomSource init(seed: UInt64) { diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/SeedableRandomNumberGenerator.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/SeedableRNG.swift similarity index 95% rename from Sources/SwiftNLP/1. Data Collection/HNSW/SeedableRandomNumberGenerator.swift rename to Sources/SwiftNLP/1. Data Collection/HNSW/SeedableRNG.swift index 3d55c0afef377d69c7e260f6084ff9e8b5e0e90f..bd7d52e7d8a2d962b48eee01c1e60973e8b2f5ae 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/SeedableRandomNumberGenerator.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/SeedableRNG.swift @@ -26,7 +26,7 @@ import Foundation -struct SeedableRandomNumberGenerator: RandomNumberGenerator { +struct SeedableRNG: RandomNumberGenerator { private var seed: UInt64 init(seed: UInt64) { diff --git a/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap b/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap deleted file mode 100644 index 27ec8504e9b276b85b994e7a800b3d702ad1facc..0000000000000000000000000000000000000000 Binary files a/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap and /dev/null differ diff --git a/Sources/SwiftNLP/Resources/hnsw_testbasicqueryexample.mmap b/Sources/SwiftNLP/Resources/hnsw_testbasicqueryexample.mmap deleted file mode 100644 index fde0a6d18e6963661d8f6dafe5eaa7d62058b3b7..0000000000000000000000000000000000000000 Binary files a/Sources/SwiftNLP/Resources/hnsw_testbasicqueryexample.mmap and /dev/null differ diff --git a/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap b/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap deleted file mode 100644 index 56ba6ddb016fd6d65ae86aeaf7f82666a3d07941..0000000000000000000000000000000000000000 Binary files a/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap and /dev/null differ diff --git a/Tests/SwiftNLPTests/2. Encoding/DurableHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift similarity index 100% rename from Tests/SwiftNLPTests/2. Encoding/DurableHNSWCorpusTests.swift rename to Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift diff --git a/Tests/SwiftNLPTests/2. Encoding/EphemeralHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift similarity index 100% rename from Tests/SwiftNLPTests/2. Encoding/EphemeralHNSWCorpusTests.swift rename to Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift