Skip to content
Snippets Groups Projects
Commit 768b1fa4 authored by Mingchung Xia's avatar Mingchung Xia
Browse files

Moved HNSW files to own modules, added seedable RNG

parent 3a30f7f3
No related branches found
No related tags found
1 merge request!13HNSW Implementation with Testcases
Pipeline #110239 passed
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-01-28.
//
import Foundation
import SimilarityMetric
/// Euclidean ("Cartesian") distance between two floating-point vectors,
/// exposed through the `SimilarityMetric` interface.
public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {
    /// Returns the square root of the summed squared element-wise differences.
    ///
    /// Elements are paired with `zip`, so when the two vectors differ in
    /// length the surplus elements of the longer one are ignored.
    ///
    /// - Parameters:
    ///   - someItem: The first vector.
    ///   - otherItem: The second vector.
    /// - Returns: The Euclidean distance, in the vectors' own element type.
    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
        var sumOfSquares: Vector.Element = 0
        for (lhs, rhs) in zip(someItem, otherItem) {
            let delta = lhs - rhs
            sumOfSquares += delta * delta
        }
        return sumOfSquares.squareRoot()
    }
}
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-01-28.
//
#if canImport(GameplayKit) && os(macOS)
import Foundation
import GameplayKit
// MARK: GameplayKit provides a Mersenne Twister RNG, but it is not available on Linux.
// See the https://github.com/quells/Squall package for an alternative Mersenne Twister.
/// A seedable `RandomNumberGenerator` backed by GameplayKit's Mersenne Twister.
///
/// Each call to `next()` draws two integers from the underlying source and
/// concatenates their 32-bit patterns into one 64-bit value.
@available(macOS, introduced: 10.11)
struct DeterministicRandomNumberGenerator: RandomNumberGenerator {
    private let randomSource: GKMersenneTwisterRandomSource

    /// Creates a generator whose underlying source is seeded with `seed`.
    init(seed: UInt64) {
        randomSource = GKMersenneTwisterRandomSource(seed: seed)
    }

    /// Produces the next 64-bit value: the first draw fills the high 32 bits,
    /// the second draw fills the low 32 bits.
    mutating func next() -> UInt64 {
        // NOTE(review): `Int32(_:)` traps if `nextInt()` ever returns a value
        // outside the Int32 range; presumably it never does — confirm against
        // the GameplayKit documentation.
        let highWord = UInt32(bitPattern: Int32(randomSource.nextInt()))
        let lowWord = UInt32(bitPattern: Int32(randomSource.nextInt()))
        return (UInt64(highWord) << 32) | UInt64(lowWord)
    }
}
#endif
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-01-28.
//
import Foundation
import PriorityHeapModule
import PriorityHeapAlgorithms
import HNSWAlgorithm
import HNSWEphemeral
/// A wrapper around an `EphemeralVectorIndex` that feeds every insertion from
/// a fixed-seed random number generator, so inserting the same vectors in the
/// same order always hands the index the same random sequence.
public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint {
    public typealias Index = EphemeralVectorIndex<Int, Int, CartesianDistanceMetric<Vector>, Void>

    /// The wrapped index.
    public var base: Index

    /// Fixed-seed generator passed to every `base.insert` call.
    private var srng = SeedableRandomNumberGenerator(seed: 1)
    // private var drng = DeterministicRandomNumberGenerator(seed: 1)

    /// Creates an empty index using `CartesianDistanceMetric` and the
    /// `.unstableDefault` configuration for the given neighborhood size.
    public init(typicalNeighborhoodSize: Int) {
        base = .init(metric: CartesianDistanceMetric<Vector>(), config: .unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize))
    }

    /// Looks up neighbors of `query`.
    ///
    /// - Parameters:
    ///   - query: The vector to search around.
    ///   - limit: Maximum number of neighbors to return.
    ///   - exact: When `true`, every stored vector is scored with the metric
    ///     and the first `limit` entries of the heap's descending order are
    ///     returned; when `false`, the search is delegated to `base.find`.
    /// - Returns: At most `limit` neighbors.
    /// - Throws: Whatever `base.find` throws on the non-exact path.
    public func find(near query: Vector, limit: Int, exact: Bool = false) throws -> [Index.Neighbor] {
        if exact {
            return Array(PriorityHeap(base.vectors.enumerated().map {
                let similarity = base.metric.similarity(between: query, $0.element)
                return NearbyVector(id: $0.offset, vector: $0.element, priority: similarity)
            }).descending().prefix(limit))
        } else {
            return Array(try base.find(near: query, limit: limit))
        }
    }

    /// Inserts `vector` into the wrapped index using the fixed-seed generator.
    ///
    /// The vector is passed through unchanged. The index's vector type is
    /// `Vector` itself, so the earlier round-trip through `[Double]` followed
    /// by a conditional cast could only succeed for `Vector == [Double]` and
    /// hit `fatalError` for every other instantiation (e.g. `[Float]`).
    public mutating func insert(_ vector: Vector) {
        // `base.insert` returns a key that this wrapper does not track; the
        // call is made for its mutation of `base`.
        _ = base.insert(vector, using: &srng)
        // _ = base.insert(vector, using: &drng)
    }
}
...@@ -22,13 +22,6 @@ ...@@ -22,13 +22,6 @@
// OTHER DEALINGS IN THE SOFTWARE. // OTHER DEALINGS IN THE SOFTWARE.
import Foundation import Foundation
import PriorityHeapModule
import PriorityHeapAlgorithms
import SimilarityMetric
import HNSWAlgorithm
import HNSWEphemeral
import GameplayKit // Not avaliable on Linux? - try to change to other random
class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
...@@ -39,8 +32,6 @@ class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { ...@@ -39,8 +32,6 @@ class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
// typicalNeighbourhoodSize = 20 is a standard benchmark // typicalNeighbourhoodSize = 20 is a standard benchmark
var encodedDocuments: DeterministicSampleVectorIndex = DeterministicSampleVectorIndex<[Scalar]>(typicalNeighborhoodSize: 20) var encodedDocuments: DeterministicSampleVectorIndex = DeterministicSampleVectorIndex<[Scalar]>(typicalNeighborhoodSize: 20)
// Map from Key to documentId - similar to DictionaryCorpus
init(_documentEncoder: ContextFreeEncoder<Scalar>) { init(_documentEncoder: ContextFreeEncoder<Scalar>) {
self._documentEncoder = _documentEncoder self._documentEncoder = _documentEncoder
zeroes = Array(repeating: Scalar(0), count: 384) zeroes = Array(repeating: Scalar(0), count: 384)
...@@ -52,60 +43,3 @@ class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { ...@@ -52,60 +43,3 @@ class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
} }
} }
/// A wrapper around an `EphemeralVectorIndex` that feeds every insertion from
/// a fixed-seed random number generator, so inserting the same vectors in the
/// same order always hands the index the same random sequence.
public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint {
    public typealias Index = EphemeralVectorIndex<Int, Int, CartesianDistanceMetric<Vector>, Void>

    /// The wrapped index.
    public var base: Index

    /// Fixed-seed generator passed to every `base.insert` call.
    private var graphRNG = DeterministicRandomNumberGenerator(seed: 1)

    /// Creates an empty index using `CartesianDistanceMetric` and the
    /// `.unstableDefault` configuration for the given neighborhood size.
    public init(typicalNeighborhoodSize: Int) {
        base = .init(metric: CartesianDistanceMetric<Vector>(), config: .unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize))
    }

    /// Looks up neighbors of `query`.
    ///
    /// - Parameters:
    ///   - query: The vector to search around.
    ///   - limit: Maximum number of neighbors to return.
    ///   - exact: When `true`, every stored vector is scored with the metric
    ///     and the first `limit` entries of the heap's descending order are
    ///     returned; when `false`, the search is delegated to `base.find`.
    /// - Returns: At most `limit` neighbors.
    /// - Throws: Whatever `base.find` throws on the non-exact path.
    public func find(near query: Vector, limit: Int, exact: Bool = false) throws -> [Index.Neighbor] {
        if exact {
            return Array(PriorityHeap(base.vectors.enumerated().map {
                let similarity = base.metric.similarity(between: query, $0.element)
                return NearbyVector(id: $0.offset, vector: $0.element, priority: similarity)
            }).descending().prefix(limit))
        } else {
            return Array(try base.find(near: query, limit: limit))
        }
    }

    /// Inserts `vector` into the wrapped index using the fixed-seed generator.
    ///
    /// The vector is passed through unchanged. The index's vector type is
    /// `Vector` itself, so the earlier round-trip through `[Double]` followed
    /// by a conditional cast could only succeed for `Vector == [Double]` and
    /// hit `fatalError` for every other instantiation (e.g. `[Float]`).
    public mutating func insert(_ vector: Vector) {
        // `base.insert` returns a key that this wrapper does not track; the
        // call is made for its mutation of `base`.
        _ = base.insert(vector, using: &graphRNG)
    }
}
/// Euclidean ("Cartesian") distance between two floating-point vectors,
/// exposed through the `SimilarityMetric` interface.
public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {
    /// Returns the square root of the summed squared element-wise differences.
    ///
    /// Elements are paired with `zip`, so surplus elements of a longer vector
    /// are ignored.
    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
        let total = zip(someItem, otherItem).reduce(into: Vector.Element.zero) { running, pair in
            let gap = pair.0 - pair.1
            running += gap * gap
        }
        return total.squareRoot()
    }
}
/// A seedable `RandomNumberGenerator` backed by GameplayKit's Mersenne Twister.
struct DeterministicRandomNumberGenerator: RandomNumberGenerator {
    // Try another package for this...
    private let randomSource: GKMersenneTwisterRandomSource

    /// Creates a generator whose underlying source is seeded with `seed`.
    init(seed: UInt64) {
        randomSource = GKMersenneTwisterRandomSource(seed: seed)
    }

    /// Produces the next 64-bit value from two consecutive draws: the first
    /// supplies the high 32 bits, the second the low 32 bits.
    mutating func next() -> UInt64 {
        let source = randomSource
        // Reinterprets one draw's 32-bit pattern as an unsigned word.
        func drawWord() -> UInt64 {
            UInt64(UInt32(bitPattern: Int32(source.nextInt())))
        }
        let upper = drawWord()
        let lower = drawWord()
        return (upper << 32) | lower
    }
}
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-01-28.
//
import Foundation
/// A minimal seedable `RandomNumberGenerator` built on a 64-bit linear
/// congruential recurrence with wrapping arithmetic.
///
/// The same seed always yields the same sequence.
struct SeedableRandomNumberGenerator: RandomNumberGenerator {
    /// Multiplier of the recurrence `state = multiplier * state + 1`.
    private static let multiplier: UInt64 = 6364136223846793005

    /// Current state; it is also the value most recently returned.
    private var state: UInt64

    /// Creates a generator starting from `seed`.
    init(seed: UInt64) {
        state = seed
    }

    /// Advances the recurrence one step and returns the new state.
    mutating func next() -> UInt64 {
        // `&*` and `&+` wrap on overflow, i.e. arithmetic is modulo 2^64.
        state = (Self.multiplier &* state) &+ 1
        return state
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment