From d5d9c57390a7472c53f2de9b4e395de770103873 Mon Sep 17 00:00:00 2001
From: Mingchung Xia <mingchung.xia@gmail.com>
Date: Sun, 21 Jan 2024 15:06:34 -0500
Subject: [PATCH] Started on HNSWCorpus

---
 .../HNSWCorpus + Sequence.swift               |  6 +++
 .../1. Data Collection/HNSWCorpus.swift       | 52 +++++++++++++++++--
 2 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Sequence.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Sequence.swift
index d4a88a33..686d3fe9 100644
--- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Sequence.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Sequence.swift	
@@ -21,6 +21,10 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 
+// MARK: Sequence conformance will be done when HNSWCorpus is complete
+
+/*
+ 
 extension HNSWCorpus: Sequence {
         
     typealias Element = [Scalar]
@@ -54,3 +58,5 @@ extension HNSWCorpus: Sequence {
         return encodedDocuments.index(after: i)
     }
 }
+
+*/
diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift
index 23bfa262..269e183d 100644
--- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift	
@@ -28,6 +28,8 @@ import PriorityHeapAlgorithms
 import SimilarityMetric
 import HNSWAlgorithm
 import HNSWEphemeral
+//import HNSWSample
+import GameplayKit
 
 
 class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
@@ -36,18 +38,27 @@ class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
     var zeroes: [Scalar]
     var count: Int { 0 }
     
-    var encodedDocuments: [Int : [Scalar]] = [:] // TODO: This should be replaced by HNSW
+    // var encodedDocuments: [Int : [Scalar]] = [:]
+
+    // MARK: The appropriate typicalNeighborhoodSize is not yet known (20 is a placeholder)
+    var encodedDocuments: DeterministicSampleVectorIndex = DeterministicSampleVectorIndex<[Scalar]>(typicalNeighborhoodSize: 20)
     
     init(_documentEncoder: ContextFreeEncoder<Scalar>) {
         self._documentEncoder = _documentEncoder
         zeroes = Array(repeating: Scalar(0), count: 384)
     }
     
+    // TODO: Complete implementation of addUntokenizedDocument
     @inlinable
     func addUntokenizedDocument(_ document: String) {
-        fatalError("HNSWCorpus not implemented yet. Get on it.")
+        encodedDocuments.insertRandom(document)
     }
     
+    // MARK: HNSW indexes do not support deletion - index must be rebuilt
+
+//    The following code is taken from Tests/HNSWTests/HNSWIndexTests.swift
+//    The test case randomly inserts and randomly queries neighbours.
+//
 //    var index = DeterministicSampleVectorIndex(typicalNeighborhoodSize: 20)
 //    for _ in 0..<100 {
 //        index.insertRandom(range: 0...1)
@@ -65,7 +76,7 @@ class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
 
 
 
-
+// TODO: Continue rewriting these structures: this implementation uses the generic `Vector` type instead of `[Double]`
 
 public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint {
     public typealias Index = EphemeralVectorIndex<Int, Int, CartesianDistanceMetric<[Double]>, Void>
@@ -75,6 +86,9 @@ public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where
         base = .init(metric: .init(), config: .unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize))
     }
     
+    private var vectorRNG = DeterministicRandomNumberGenerator(seed: 0)
+    private var graphRNG = DeterministicRandomNumberGenerator(seed: 1)
+    
     public func find(near query: Vector, limit: Int, exact: Bool = false) throws -> [Index.Neighbor] {
         if exact {
             Array(PriorityHeap(base.vectors.enumerated().map {
@@ -86,8 +100,39 @@ public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where
         }
     }
     
+    // TODO: Should this generate a random `Vector` instead of a `CGPoint`? How many elements does a `Vector` have?
+    
+    public mutating func generateRandom(range: ClosedRange<Double>) -> Vector {
+        // Not implemented yet: it is still undecided how a generic `Vector` should be built.
+        // The earlier draft, kept for reference, returned a 2-D point:
+        //     CGPoint(x: .random(in: range, using: &vectorRNG),
+        //             y: .random(in: range, using: &vectorRNG))
+        // Trap with a clear message; an empty body has no return value and does not compile.
+        fatalError("generateRandom(range:) is not implemented yet")
+    }
+    
+    public mutating func insertRandom(range: ClosedRange<Double>) {
+        base.insert(generateRandom(range: range).map { Double($0) }, using: &graphRNG) // element-wise conversion; a forced cast to [Double] traps when Vector is e.g. [Float]
+    }
+    
 }
 
+
+/// A `RandomNumberGenerator` that draws its bits from a seeded GameplayKit Mersenne Twister source.
+struct DeterministicRandomNumberGenerator: RandomNumberGenerator {
+    private let randomSource: GKMersenneTwisterRandomSource
+    /// Creates a generator whose Mersenne Twister source is seeded with `seed`.
+    init(seed: UInt64) { self.randomSource = GKMersenneTwisterRandomSource(seed: seed) }
+
+    /// Draws twice from the source and packs the results as the high word, then the low word.
+    mutating func next() -> UInt64 {
+        let highWord = UInt32(bitPattern: Int32(randomSource.nextInt()))
+        let lowWord = UInt32(bitPattern: Int32(randomSource.nextInt()))
+        return (UInt64(highWord) << 32) | UInt64(lowWord)
+    }
+}
+
+
 public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint{
     public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
         // Naïve cartesian distance
@@ -98,4 +143,3 @@ public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityM
         return sqrt(squaredSum)
     }
 }
-
-- 
GitLab