From d5d9c57390a7472c53f2de9b4e395de770103873 Mon Sep 17 00:00:00 2001 From: Mingchung Xia <mingchung.xia@gmail.com> Date: Sun, 21 Jan 2024 15:06:34 -0500 Subject: [PATCH] Started on HNSWCorpus --- .../HNSWCorpus + Sequence.swift | 6 +++ .../1. Data Collection/HNSWCorpus.swift | 52 +++++++++++++++++-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Sequence.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Sequence.swift index d4a88a33..686d3fe9 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Sequence.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Sequence.swift @@ -21,6 +21,10 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. +// MARK: Sequence conformance will be done when HNSWCorpus is complete + +/* + extension HNSWCorpus: Sequence { typealias Element = [Scalar] @@ -54,3 +58,5 @@ extension HNSWCorpus: Sequence { return encodedDocuments.index(after: i) } } + +*/ diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift index 23bfa262..269e183d 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift @@ -28,6 +28,8 @@ import PriorityHeapAlgorithms import SimilarityMetric import HNSWAlgorithm import HNSWEphemeral +//import HNSWSample +import GameplayKit class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { @@ -36,18 +38,27 @@ class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { var zeroes: [Scalar] var count: Int { 0 } - var encodedDocuments: [Int : [Scalar]] = [:] // TODO: This should be replaced by HNSW + // var encodedDocuments: [Int : [Scalar]] = [:] + + // MARK: typicalNeighbourhoodSize is unknown + var encodedDocuments: DeterministicSampleVectorIndex = DeterministicSampleVectorIndex<[Scalar]>(typicalNeighborhoodSize: 20) init(_documentEncoder: ContextFreeEncoder<Scalar>) { self._documentEncoder = _documentEncoder zeroes = Array(repeating: Scalar(0), count: 384) } + // TODO: Complete implementation of addUntokenizedDocument @inlinable func addUntokenizedDocument(_ document: String) { - fatalError("HNSWCorpus not implemented yet. Get on it.") + encodedDocuments.insertRandom(document) } + // MARK: HNSW indexes do not support deletion - index must be rebuilt + +// The following code is taken from Tests/HNSWTests/HNSWIndexTests.swift +// The test case randomly inserts and randomly queries neighbours. +// // var index = DeterministicSampleVectorIndex(typicalNeighborhoodSize: 20) // for _ in 0..<100 { // index.insertRandom(range: 0...1) @@ -65,7 +76,7 @@ class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { - +// TODO: Continue overwriting these structures: this implementation uses the Vector instead of [Double] public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint { public typealias Index = EphemeralVectorIndex<Int, Int, CartesianDistanceMetric<[Double]>, Void> @@ -75,6 +86,9 @@ public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where base = .init(metric: .init(), config: .unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize)) } + private var vectorRNG = DeterministicRandomNumberGenerator(seed: 0) + private var graphRNG = DeterministicRandomNumberGenerator(seed: 1) + public func find(near query: Vector, limit: Int, exact: Bool = false) throws -> [Index.Neighbor] { if exact { Array(PriorityHeap(base.vectors.enumerated().map { @@ -86,8 +100,39 @@ public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where } } + // Should we be generating random Vector instead of CGPoint? How long is a Vector? + + public mutating func generateRandom(range: ClosedRange<Double>) -> Vector { + /* + CGPoint( + x: .random(in: range, using: &vectorRNG), + y: .random(in: range, using: &vectorRNG) + ) + */ + } + + public mutating func insertRandom(range: ClosedRange<Double>) { + base.insert(generateRandom(range: range) as! [Double], using: &graphRNG) + } + } + +struct DeterministicRandomNumberGenerator: RandomNumberGenerator { + private let randomSource: GKMersenneTwisterRandomSource + + init(seed: UInt64) { + randomSource = GKMersenneTwisterRandomSource(seed: seed) + } + + mutating func next() -> UInt64 { + let upperBits = UInt64(UInt32(bitPattern: Int32(randomSource.nextInt()))) << 32 + let lowerBits = UInt64(UInt32(bitPattern: Int32(randomSource.nextInt()))) + return upperBits | lowerBits + } +} + + public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint{ public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element { // Naïve cartesian distance @@ -98,4 +143,3 @@ public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityM return sqrt(squaredSum) } } - -- GitLab