From be9af580b03f5ea36dd1b4eb0614aa3eb622f571 Mon Sep 17 00:00:00 2001
From: Jim Wallace <james.wallace@uwaterloo.ca>
Date: Wed, 3 Apr 2024 15:17:14 -0400
Subject: [PATCH] Simplifying SNLPCorpus, added InMemoryCorpus as new
 conforming type.

---
 .../SNLPCorpus.swift                          |  30 +-
 .../SNLPSearchableCorpus.swift                |  34 --
 ...yCorpus + RangeReplacableColleection.swift |  29 --
 .../DictionaryCorpus + Sequence.swift         |  56 ---
 .../1. Data Collection/DictionaryCorpus.swift |  93 ++--
 .../DurableHNSWCorpus + Sequence.swift        | 130 +++---
 .../DurableHNSWCorpus.swift                   |  10 +-
 .../EphemeralHNSWCorpus + Dictionary.swift    | 170 +++----
 .../EphemeralHNSWCorpus + Sequence.swift      | 126 +++---
 .../EphemeralHNSWCorpus.swift                 | 154 +++----
 .../1. Data Collection/InMemoryCorpus.swift   |  72 +++
 .../String + SNLPDataItem.swift               |   6 +-
 .../HNSW/DurableHNSWCorpusTests.swift         |  18 +-
 .../HNSW/EphemeralHNSWCorpusTests.swift       | 414 +++++++++---------
 .../2. Encoding/ContextFreeEncoderTests.swift |  10 +-
 .../NaturalLanguageEncoderTests.swift         |  10 +-
 16 files changed, 669 insertions(+), 693 deletions(-)
 delete mode 100644 Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPSearchableCorpus.swift
 delete mode 100644 Sources/SwiftNLP/1. Data Collection/DictionaryCorpus + RangeReplacableColleection.swift
 delete mode 100644 Sources/SwiftNLP/1. Data Collection/DictionaryCorpus + Sequence.swift
 create mode 100644 Sources/SwiftNLP/1. Data Collection/InMemoryCorpus.swift

diff --git a/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPCorpus.swift b/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPCorpus.swift
index 0e17fcdb..3d922641 100644
--- a/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPCorpus.swift	
+++ b/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPCorpus.swift	
@@ -23,30 +23,46 @@
 
 import Foundation
 
-protocol SNLPCorpus<Scalar>: Collection {
+protocol SNLPCorpus<Item> {
+    
+    associatedtype Item: SNLPDataItem
     
     associatedtype Scalar: BinaryFloatingPoint
     associatedtype Encoder: SNLPEncoder where Encoder.Scalar == Scalar
-    associatedtype Item: SNLPDataItem
+    
+    associatedtype DocumentStorage: RandomAccessCollection & RangeReplaceableCollection where DocumentStorage.Element == Item
+    associatedtype EmbeddingStorage: RandomAccessCollection  & RangeReplaceableCollection where EmbeddingStorage.Element == [Scalar]
+        
+    var documents: DocumentStorage { get set }
+    var encodedDocuments: EmbeddingStorage { get set }
+    
+    var documentEncoder: Encoder { get }
     
     var zeroes: [Scalar] { get }
+    var dimensions: UInt { get }
     var count: Int { get }
     
-    mutating func addUntokenizedDocument(_ document: Item)
-    mutating func addUntokenizedDocuments(_ documents: [Item])
+    func addUntokenizedDocument(_ document: Item)
+    func addUntokenizedDocuments(_ documents: [Item])
+    
+    func searchFor(_ query: String) -> [Item]
 }
 
 
 
 extension SNLPCorpus {
-        
+       
+    
+    var zeroes: [Scalar] { documentEncoder.zeroes }
+    var dimensions: UInt { documentEncoder.dimensions }
+    
     /**
         Adds a series of untokenized documents to the corpus, using default tokenization and text processing
      */
-    @inlinable
-    mutating func addUntokenizedDocuments(_ documents: [Item]) {
+    func addUntokenizedDocuments(_ documents: [Item]) {
         for d in documents {
             addUntokenizedDocument(d)
         }
     }
+        
 }
diff --git a/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPSearchableCorpus.swift b/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPSearchableCorpus.swift
deleted file mode 100644
index 5ca038b8..00000000
--- a/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPSearchableCorpus.swift	
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2024 Jim Wallace
-//
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-//
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-import Foundation
-
-protocol SNLPSearchableCorpus: SNLPCorpus {
-    
-    func isTrained() -> Bool
-    func train()
-    
-    func searchFor(_ query: String) -> [String]
-    
-}
-
diff --git a/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus + RangeReplacableColleection.swift b/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus + RangeReplacableColleection.swift
deleted file mode 100644
index 744bd8a6..00000000
--- a/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus + RangeReplacableColleection.swift	
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2024 Jim Wallace
-//
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-//
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-//extension DictionaryCorpus: RangeReplaceableCollection {
-//        
-//    func replaceSubrange<C: Collection>(_ range: Range<DictionaryCorpus.Index>, with newElements: C) where DictionaryCorpus.Element == C.Element {
-//        
-//    }
-//}
diff --git a/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus + Sequence.swift b/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus + Sequence.swift
deleted file mode 100644
index c2bde96a..00000000
--- a/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus + Sequence.swift	
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (c) 2024 Jim Wallace
-//
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-//
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-extension DictionaryCorpus: Sequence {
-        
-    typealias Element = [Scalar]
-        
-    
-    // Sequence Protocol Requirements
-    @inlinable
-    func makeIterator() -> Dictionary<Int, [Scalar]>.Values.Iterator {
-        return encodedDocuments.values.makeIterator()
-    }
-    
-    
-    // Collection Protocol Requirements
-    @inlinable
-    var startIndex: Dictionary<Int, [Scalar]>.Index {
-        return encodedDocuments.startIndex
-    }
-    
-    @inlinable
-    var endIndex: Dictionary<Int, [Scalar]>.Index {
-        return encodedDocuments.endIndex
-    }
-    
-    @inlinable
-    subscript(position: Dictionary<Int, [Scalar]>.Index) -> [Scalar] {
-        encodedDocuments.values[position]
-    }
-    
-    @inlinable
-    func index(after i: Dictionary<Int, [Scalar]>.Index) -> Dictionary<Int, [Scalar]>.Index {
-        return encodedDocuments.index(after: i)
-    }
-}
diff --git a/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus.swift b/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus.swift
index faf4c4fb..591e44e7 100644
--- a/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/DictionaryCorpus.swift	
@@ -1,48 +1,51 @@
-// Copyright (c) 2024 Jim Wallace
+//// Copyright (c) 2024 Jim Wallace
+////
+//// Permission is hereby granted, free of charge, to any person
+//// obtaining a copy of this software and associated documentation
+//// files (the "Software"), to deal in the Software without
+//// restriction, including without limitation the rights to use,
+//// copy, modify, merge, publish, distribute, sublicense, and/or sell
+//// copies of the Software, and to permit persons to whom the
+//// Software is furnished to do so, subject to the following
+//// conditions:
+////
+//// The above copyright notice and this permission notice shall be
+//// included in all copies or substantial portions of the Software.
+////
+//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+//// OTHER DEALINGS IN THE SOFTWARE.
 //
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
+//import Foundation
 //
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
+//final class DictionaryCorpus<Scalar: BinaryFloatingPoint, Encoder: SNLPEncoder, Item: SNLPDataItem>: SNLPCorpus where Encoder.Scalar == Scalar {
+//    
+//                                                            
+//    internal var _documentEncoder: Encoder
+//    var zeroes: [Scalar] { _documentEncoder.zeroes }
+//    
+//    var documents: any RangeReplaceableCollection<Item>
+//    var encodedDocuments: any RangeReplaceableCollection<[Scalar]>
 //
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-import Foundation
-
-final class DictionaryCorpus<Scalar: BinaryFloatingPoint, Encoder: SNLPEncoder, Item: SNLPDataItem>: SNLPCorpus where Encoder.Scalar == Scalar {
-                                                            
-    internal var _documentEncoder: Encoder
-    var zeroes: [Scalar] { _documentEncoder.zeroes }
-    
-    var encodedDocuments: [Int : [Scalar] ] = [:]
-    var count: Int { encodedDocuments.count }
-    
-        
-    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings) {
-        _documentEncoder = ContextFreeEncoder<Scalar>(source: encoding) as! Encoder
-    }
-    
-    init(encoder: Encoder) {
-        _documentEncoder = encoder
-    }
-        
-    @inlinable
-    func addUntokenizedDocument(_ document: Item) {
-        encodedDocuments[ encodedDocuments.count ] = (_documentEncoder.encodeSentence(document.fullText) )
-    }
-    
-}
+//    var count: Int { encodedDocuments.count }
+//    
+//        
+//    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings) {
+//        _documentEncoder = ContextFreeEncoder<Scalar>(source: encoding) as! Encoder
+//    }
+//    
+//    init(encoder: Encoder) {
+//        _documentEncoder = encoder
+//    }
+//        
+//    
+//    func addUntokenizedDocument(_ document: Item) {
+//        //encodedDocuments[ encodedDocuments.count ] = (_documentEncoder.encodeSentence(document.fullText) )
+//    }
+//    
+//}
diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift
index 072b2b1d..60cba61e 100644
--- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift	
@@ -1,69 +1,69 @@
-// Copyright (c) 2024 Jim Wallace
+//// Copyright (c) 2024 Jim Wallace
+////
+//// Permission is hereby granted, free of charge, to any person
+//// obtaining a copy of this software and associated documentation
+//// files (the "Software"), to deal in the Software without
+//// restriction, including without limitation the rights to use,
+//// copy, modify, merge, publish, distribute, sublicense, and/or sell
+//// copies of the Software, and to permit persons to whom the
+//// Software is furnished to do so, subject to the following
+//// conditions:
+////
+//// The above copyright notice and this permission notice shall be
+//// included in all copies or substantial portions of the Software.
+////
+//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+//// OTHER DEALINGS IN THE SOFTWARE.
+////
+//// Created by Mingchung Xia on 2024-03-16.
+////
 //
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
+//#if os(macOS)
 //
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
+//import Foundation
 //
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
+///// HNSWCorpus iterates through its dictionary of key to document vector pairs
+// 
+//extension DurableHNSWCorpus: Sequence, Collection {
+//    // Sequence Protocol Requirements
+//    @inlinable
+//    func makeIterator() -> AnyIterator<DocumentVectorPair> {
+//        var iterator = dictionary.values.makeIterator()
+//        return AnyIterator {
+//            return iterator.next()
+//        }
+//    }
+//    
+//    // Collection Protocol Requirements
+//    @inlinable
+//    var startIndex: Int {
+//        return dictionary.keys.sorted().startIndex
+//    }
+//    
+//    @inlinable
+//    var endIndex: Int {
+//        return dictionary.keys.sorted().endIndex
+//    }
+//    
+//    @inlinable
+//    subscript(position: Int) -> DocumentVectorPair {
+//        let key = dictionary.keys.sorted()[position]
+//        guard let pair = dictionary[key] else {
+//            fatalError("Key \(key) not found in HNSW dictionary")
+//        }
+//        return pair
+//    }
+//    
+//    @inlinable
+//    func index(after i: Int) -> Int {
+//        return dictionary.keys.sorted().index(after: i)
+//    }
+//}
 //
-// Created by Mingchung Xia on 2024-03-16.
-//
-
-#if os(macOS)
-
-import Foundation
-
-/// HNSWCorpus iterates through its dictionary of key to document vector pairs
- 
-extension DurableHNSWCorpus: Sequence, Collection {
-    // Sequence Protocol Requirements
-    @inlinable
-    func makeIterator() -> AnyIterator<DocumentVectorPair> {
-        var iterator = dictionary.values.makeIterator()
-        return AnyIterator {
-            return iterator.next()
-        }
-    }
-    
-    // Collection Protocol Requirements
-    @inlinable
-    var startIndex: Int {
-        return dictionary.keys.sorted().startIndex
-    }
-    
-    @inlinable
-    var endIndex: Int {
-        return dictionary.keys.sorted().endIndex
-    }
-    
-    @inlinable
-    subscript(position: Int) -> DocumentVectorPair {
-        let key = dictionary.keys.sorted()[position]
-        guard let pair = dictionary[key] else {
-            fatalError("Key \(key) not found in HNSW dictionary")
-        }
-        return pair
-    }
-    
-    @inlinable
-    func index(after i: Int) -> Int {
-        return dictionary.keys.sorted().index(after: i)
-    }
-}
-
-#endif
+//#endif
diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift
index 7ced3483..8cee16b5 100644
--- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift	
@@ -37,8 +37,8 @@ import CoreLMDBCoders
 final class DurableHNSWCorpus<Scalar: BinaryFloatingPoint & Codable & UnsafeMemoryLayoutStorableFloat> {
     public typealias HNSWDictionary = [Int: DocumentVectorPair]
     
-    internal var _documentEncoder: any SNLPEncoder<Scalar>
-    var zeroes: [Scalar] { _documentEncoder.zeroes }
+    internal var documentEncoder: any SNLPEncoder<Scalar>
+    var zeroes: [Scalar] { documentEncoder.zeroes }
     
     var encodedDocuments: DeterministicDurableVectorIndex<Scalar>
     var count: Int { encodedDocuments.size }
@@ -48,7 +48,7 @@ final class DurableHNSWCorpus<Scalar: BinaryFloatingPoint & Codable & UnsafeMemo
 
     // typicalNeighbourhoodSize = 20 is a standard benchmark
     init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
-        _documentEncoder = ContextFreeEncoder(source: encoding)
+        documentEncoder = ContextFreeEncoder(source: encoding)
         
         encodedDocuments = try DeterministicDurableVectorIndex<Scalar>(
             namespace: namespace,
@@ -58,7 +58,7 @@ final class DurableHNSWCorpus<Scalar: BinaryFloatingPoint & Codable & UnsafeMemo
     }
     
     init(encoder: any SNLPEncoder<Scalar>, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
-        _documentEncoder = encoder
+        documentEncoder = encoder
         encodedDocuments = try DeterministicDurableVectorIndex<Scalar>(
             namespace: namespace,
             typicalNeighborhoodSize: typicalNeighborhoodSize,
@@ -70,7 +70,7 @@ final class DurableHNSWCorpus<Scalar: BinaryFloatingPoint & Codable & UnsafeMemo
     func addUntokenizedDocument(_ document: String, in transaction: Transaction) throws {
         /// forced unwrap as! [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder
         /// encodedDocuments.insert will insert and return the corresponding key (id)s        
-        let encodedVector = _documentEncoder.encodeSentence(document)
+        let encodedVector = documentEncoder.encodeSentence(document)
         let key = try encodedDocuments.insert(encodedVector, in: transaction)
         addDocumentVectorPair(
             at: key,
diff --git a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Dictionary.swift b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Dictionary.swift
index 1d23ebf6..2ec4bffb 100644
--- a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Dictionary.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Dictionary.swift	
@@ -1,89 +1,89 @@
-// Copyright (c) 2024 Jim Wallace
+//// Copyright (c) 2024 Jim Wallace
+////
+//// Permission is hereby granted, free of charge, to any person
+//// obtaining a copy of this software and associated documentation
+//// files (the "Software"), to deal in the Software without
+//// restriction, including without limitation the rights to use,
+//// copy, modify, merge, publish, distribute, sublicense, and/or sell
+//// copies of the Software, and to permit persons to whom the
+//// Software is furnished to do so, subject to the following
+//// conditions:
+////
+//// The above copyright notice and this permission notice shall be
+//// included in all copies or substantial portions of the Software.
+////
+//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+//// OTHER DEALINGS IN THE SOFTWARE.
+////
+//// Created by Mingchung Xia on 2024-02-14.
+////
 //
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
+//import Foundation
 //
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-//
-// Created by Mingchung Xia on 2024-02-14.
+////extension EphemeralHNSWCorpus {
+//    /// This extension is used for the dictionary operations
+//    public struct DocumentVectorPair {
+//        var untokenizedDocument: String
+//        var vector: [Scalar]
+//        
+//        init(untokenizedDocument: String, vector: [Scalar]) {
+//            self.untokenizedDocument = untokenizedDocument
+//            self.vector = vector
+//        }
+//    }
+//    
+//    @inlinable
+//    func getUntokenizedDocument(at key: Int) -> String {
+//        if let pair = dictionary[key] {
+//            return pair.untokenizedDocument
+//        } else {
+//            fatalError("Key \(key) not found in HNSW dictionary")
+//        }
+//    }
+//    
+//    @inlinable
+//    func getVector(at key: Int) -> [Scalar] {
+//        if let pair = dictionary[key] {
+//            return pair.vector
+//        } else {
+//            fatalError("Key \(key) not found in HNSW dictionary")
+//        }
+//    }
+//    
+//    @inlinable
+//    func getDictionary() -> [Int: DocumentVectorPair] {
+//        return dictionary
+//    }
+//    
+//    func addDocumentVectorPair(at key: Int, document: String, vector: [Scalar]) {
+//        dictionary[key] = DocumentVectorPair(
+//            untokenizedDocument: document,
+//            vector: vector
+//        )
+//    }
+//}
 //
-
-import Foundation
-
-extension EphemeralHNSWCorpus {
-    /// This extension is used for the dictionary operations
-    public struct DocumentVectorPair {
-        var untokenizedDocument: String
-        var vector: [Scalar]
-        
-        init(untokenizedDocument: String, vector: [Scalar]) {
-            self.untokenizedDocument = untokenizedDocument
-            self.vector = vector
-        }
-    }
-    
-    @inlinable
-    func getUntokenizedDocument(at key: Int) -> String {
-        if let pair = dictionary[key] {
-            return pair.untokenizedDocument
-        } else {
-            fatalError("Key \(key) not found in HNSW dictionary")
-        }
-    }
-    
-    @inlinable
-    func getVector(at key: Int) -> [Scalar] {
-        if let pair = dictionary[key] {
-            return pair.vector
-        } else {
-            fatalError("Key \(key) not found in HNSW dictionary")
-        }
-    }
-    
-    @inlinable
-    func getDictionary() -> [Int: DocumentVectorPair] {
-        return dictionary
-    }
-    
-    func addDocumentVectorPair(at key: Int, document: String, vector: [Scalar]) {
-        dictionary[key] = DocumentVectorPair(
-            untokenizedDocument: document,
-            vector: vector
-        )
-    }
-}
-
-extension EphemeralHNSWCorpus.DocumentVectorPair: Codable where Scalar: Codable {
-    enum CodingKeys: String, CodingKey {
-        case untokenizedDocument
-        case vector
-    }
-    
-    internal init(from decoder: Decoder) throws {
-        let container = try decoder.container(keyedBy: CodingKeys.self)
-        untokenizedDocument = try container.decode(String.self, forKey: .untokenizedDocument)
-        vector = try container.decode([Scalar].self, forKey: .vector)
-    }
-    
-//    internal func encode(to encoder: Encoder) throws {
-//        var container = encoder.container(keyedBy: CodingKeys.self)
-//        try container.encode(untokenizedDocument, forKey: .untokenizedDocument)
-//        try container.encode(vector, forKey: .vector)
+//extension EphemeralHNSWCorpus.DocumentVectorPair: Codable where Scalar: Codable {
+//    enum CodingKeys: String, CodingKey {
+//        case untokenizedDocument
+//        case vector
+//    }
+//    
+//    internal init(from decoder: Decoder) throws {
+//        let container = try decoder.container(keyedBy: CodingKeys.self)
+//        untokenizedDocument = try container.decode(String.self, forKey: .untokenizedDocument)
+//        vector = try container.decode([Scalar].self, forKey: .vector)
 //    }
-}
+//    
+////    internal func encode(to encoder: Encoder) throws {
+////        var container = encoder.container(keyedBy: CodingKeys.self)
+////        try container.encode(untokenizedDocument, forKey: .untokenizedDocument)
+////        try container.encode(vector, forKey: .vector)
+////    }
+//}
diff --git a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Sequence.swift b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Sequence.swift
index e9670f71..6bb9a927 100644
--- a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Sequence.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Sequence.swift	
@@ -1,66 +1,66 @@
-// Copyright (c) 2024 Jim Wallace
+//// Copyright (c) 2024 Jim Wallace
+////
+//// Permission is hereby granted, free of charge, to any person
+//// obtaining a copy of this software and associated documentation
+//// files (the "Software"), to deal in the Software without
+//// restriction, including without limitation the rights to use,
+//// copy, modify, merge, publish, distribute, sublicense, and/or sell
+//// copies of the Software, and to permit persons to whom the
+//// Software is furnished to do so, subject to the following
+//// conditions:
+////
+//// The above copyright notice and this permission notice shall be
+//// included in all copies or substantial portions of the Software.
+////
+//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+//// OTHER DEALINGS IN THE SOFTWARE.
+////
+//// Created by Mingchung Xia on 2024-02-14.
+////
 //
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
+//import Foundation
 //
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
+///// HNSWCorpus iterates through its dictionary of key to document vector pairs
+// 
+//extension EphemeralHNSWCorpus: Sequence, Collection {
+//    // Sequence Protocol Requirements
+//    @inlinable
+//    func makeIterator() -> AnyIterator<DocumentVectorPair> {
+//        var iterator = dictionary.values.makeIterator()
+//        return AnyIterator {
+//            return iterator.next()
+//        }
+//    }
+//    
+//    // Collection Protocol Requirements
+//    @inlinable
+//    var startIndex: Int {
+//        return dictionary.keys.sorted().startIndex
+//    }
+//    
+//    @inlinable
+//    var endIndex: Int {
+//        return dictionary.keys.sorted().endIndex
+//    }
+//    
+//    @inlinable
+//    subscript(position: Int) -> DocumentVectorPair {
+//        let key = dictionary.keys.sorted()[position]
+//        guard let pair = dictionary[key] else {
+//            fatalError("Key \(key) not found in HNSW dictionary")
+//        }
+//        return pair
+//    }
+//    
+//    @inlinable
+//    func index(after i: Int) -> Int {
+//        return dictionary.keys.sorted().index(after: i)
+//    }
+//}
 //
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-//
-// Created by Mingchung Xia on 2024-02-14.
-//
-
-import Foundation
-
-/// HNSWCorpus iterates through its dictionary of key to document vector pairs
- 
-extension EphemeralHNSWCorpus: Sequence, Collection {
-    // Sequence Protocol Requirements
-    @inlinable
-    func makeIterator() -> AnyIterator<DocumentVectorPair> {
-        var iterator = dictionary.values.makeIterator()
-        return AnyIterator {
-            return iterator.next()
-        }
-    }
-    
-    // Collection Protocol Requirements
-    @inlinable
-    var startIndex: Int {
-        return dictionary.keys.sorted().startIndex
-    }
-    
-    @inlinable
-    var endIndex: Int {
-        return dictionary.keys.sorted().endIndex
-    }
-    
-    @inlinable
-    subscript(position: Int) -> DocumentVectorPair {
-        let key = dictionary.keys.sorted()[position]
-        guard let pair = dictionary[key] else {
-            fatalError("Key \(key) not found in HNSW dictionary")
-        }
-        return pair
-    }
-    
-    @inlinable
-    func index(after i: Int) -> Int {
-        return dictionary.keys.sorted().index(after: i)
-    }
-}
-
diff --git a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift
index 740c2eaa..04205260 100644
--- a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift	
@@ -1,82 +1,82 @@
-// Copyright (c) 2024 Jim Wallace
+//// Copyright (c) 2024 Jim Wallace
+////
+//// Permission is hereby granted, free of charge, to any person
+//// obtaining a copy of this software and associated documentation
+//// files (the "Software"), to deal in the Software without
+//// restriction, including without limitation the rights to use,
+//// copy, modify, merge, publish, distribute, sublicense, and/or sell
+//// copies of the Software, and to permit persons to whom the
+//// Software is furnished to do so, subject to the following
+//// conditions:
+////
+//// The above copyright notice and this permission notice shall be
+//// included in all copies or substantial portions of the Software.
+////
+//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+//// OTHER DEALINGS IN THE SOFTWARE.
+////
+//// The HNSW work is based on the original work of Jaden Geller
+//// See the https://github.com/JadenGeller/similarity-topology.git
+//// for reference. The code is used with permission from the author
+//// under the MIT License.
+////
+//// Created by Mingchung Xia on 2024-02-14.
+////
 //
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
+//import Foundation
 //
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
+//// MARK: Allow EphemeralHNSWCorpus to simply be used as HNSWCorpus
+//typealias HNSWCorpus = EphemeralHNSWCorpus
 //
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
+//final class EphemeralHNSWCorpus<Scalar: BinaryFloatingPoint & Codable, Encoder: SNLPEncoder, Item: SNLPDataItem>: SNLPCorpus where Encoder.Scalar == Scalar {
+//    
+//    
+//    public typealias HNSWDictionary = [Int: DocumentVectorPair]
+//    
+//    internal var _documentEncoder: Encoder
+//    var zeroes: [Scalar] { _documentEncoder.zeroes }
+//    
+//    var encodedDocuments: DeterministicEphemeralVectorIndex<[Scalar]>
+//    var count: Int { encodedDocuments.base.vectors.count }
+//    
+//    // Keeps track of the original document for client code
+//    var dictionary: HNSWDictionary = [:]
 //
-// The HNSW work is based on the original work of Jaden Geller
-// See the https://github.com/JadenGeller/similarity-topology.git
-// for reference. The code is used with permission from the author
-// under the MIT License.
-//
-// Created by Mingchung Xia on 2024-02-14.
-//
-
-import Foundation
-
-// MARK: Allow EphemeralHNSWCorpus to simply be used as HNSWCorpus
-typealias HNSWCorpus = EphemeralHNSWCorpus
-
-final class EphemeralHNSWCorpus<Scalar: BinaryFloatingPoint & Codable, Encoder: SNLPEncoder, Item: SNLPDataItem>: SNLPCorpus where Encoder.Scalar == Scalar {
-    
-    
-    public typealias HNSWDictionary = [Int: DocumentVectorPair]
-    
-    internal var _documentEncoder: Encoder
-    var zeroes: [Scalar] { _documentEncoder.zeroes }
-    
-    var encodedDocuments: DeterministicEphemeralVectorIndex<[Scalar]>
-    var count: Int { encodedDocuments.base.vectors.count }
-    
-    // Keeps track of the original document for client code
-    var dictionary: HNSWDictionary = [:]
-
-    // typicalNeighbourhoodSize = 20 is a standard benchmark
-    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings,
-         typicalNeighborhoodSize: Int = 20) {
-        _documentEncoder = ContextFreeEncoder(source: encoding) as! Encoder
-        encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize)
-    }
-    
-    init(encoder: Encoder, typicalNeighborhoodSize: Int = 20) {
-        _documentEncoder = encoder
-        encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize)
-    }
-    
-//     // Decodable conformance
-//    required init(from decoder: Decoder) throws {
-//        let container = try decoder.container(keyedBy: CodingKeys.self)
-//        _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder)
-//        encodedDocuments = try container.decode(DeterministicEphemeralVectorIndex<[Scalar]>.self, forKey: .encodedDocuments)
-//        dictionary = try container.decode(HNSWDictionary.self, forKey: .dictionary)
+//    // typicalNeighbourhoodSize = 20 is a standard benchmark
+//    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings,
+//         typicalNeighborhoodSize: Int = 20) {
+//        _documentEncoder = ContextFreeEncoder(source: encoding) as! Encoder
+//        encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize)
+//    }
+//    
+//    init(encoder: Encoder, typicalNeighborhoodSize: Int = 20) {
+//        _documentEncoder = encoder
+//        encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize)
+//    }
+//    
+////     // Decodable conformance
+////    required init(from decoder: Decoder) throws {
+////        let container = try decoder.container(keyedBy: CodingKeys.self)
+////        _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder)
+////        encodedDocuments = try container.decode(DeterministicEphemeralVectorIndex<[Scalar]>.self, forKey: .encodedDocuments)
+////        dictionary = try container.decode(HNSWDictionary.self, forKey: .dictionary)
+////    }
+//    
+//    @inlinable
+//    func addUntokenizedDocument(_ document: Item) {
+//        /// forced unwrap as! [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder
+//        /// encodedDocuments.insert will insert and return the corresponding key (id)
+//        let key = encodedDocuments.insert((_documentEncoder.encodeSentence(document.fullText)) )
+//        addDocumentVectorPair(
+//            at: key,
+//            document: document.fullText,
+//            vector: encodedDocuments.base.vectors[key]
+//        )
 //    }
-    
-    @inlinable
-    func addUntokenizedDocument(_ document: Item) {
-        /// forced unwrap as! [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder
-        /// encodedDocuments.insert will insert and return the corresponding key (id)
-        let key = encodedDocuments.insert((_documentEncoder.encodeSentence(document.fullText)) )
-        addDocumentVectorPair(
-            at: key,
-            document: document.fullText,
-            vector: encodedDocuments.base.vectors[key]
-        )
-    }
-}
+//}
diff --git a/Sources/SwiftNLP/1. Data Collection/InMemoryCorpus.swift b/Sources/SwiftNLP/1. Data Collection/InMemoryCorpus.swift
new file mode 100644
index 00000000..fdb6ea47
--- /dev/null
+++ b/Sources/SwiftNLP/1. Data Collection/InMemoryCorpus.swift	
@@ -0,0 +1,72 @@
+// Copyright (c) 2024 Jim Wallace
+//
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+import Foundation
+
+/// A corpus that keeps every document and its encoded vector in memory.
+/// `addUntokenizedDocument` appends to both arrays together, so their indices line up.
+final class InMemoryCorpus<Scalar: BinaryFloatingPoint, Encoder: SNLPEncoder, Item: SNLPDataItem>: SNLPCorpus where Encoder.Scalar == Scalar {
+    internal var documentEncoder: Encoder
+    internal var documents = ContiguousArray<Item>()
+    internal var encodedDocuments = ContiguousArray<[Scalar]>()
+
+    /// Number of encoded documents currently stored.
+    var count: Int { encodedDocuments.count }
+
+    /// Builds the corpus around a `ContextFreeEncoder` created from `encoding`.
+    /// Traps with a descriptive message when `Encoder` is not that encoder type.
+    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings) {
+        guard let encoder = ContextFreeEncoder<Scalar>(source: encoding) as? Encoder else {
+            fatalError("InMemoryCorpus(encoding:) requires Encoder to be ContextFreeEncoder<\(Scalar.self)>, not \(Encoder.self)")
+        }
+        documentEncoder = encoder
+    }
+
+    /// Builds the corpus around a caller-supplied encoder.
+    init(encoder: Encoder) {
+        documentEncoder = encoder
+    }
+
+    /// Stores `document` together with the encoding of its `fullText`.
+    func addUntokenizedDocument(_ document: Item) {
+        documents.append(document)
+        encodedDocuments.append(documentEncoder.encodeSentence(document.fullText))
+
+        assert( documents.count == encodedDocuments.count )
+    }
+
+    /// Adds each element of `documents`, in order.
+    func addUntokenizedDocuments(_ documents: [Item]) {
+        for document in documents {
+            addUntokenizedDocument(document)
+        }
+    }
+
+    /// Naive exact-match search: returns the first document whose stored encoding
+    /// equals the encoding of `query`, or an empty array when none matches.
+    func searchFor(_ query: String) -> [Item] {
+        let q = documentEncoder.encodeSentence(query)
+        guard let index = encodedDocuments.firstIndex(of: q) else { return [] }
+        return [documents[index]]
+    }
+}
diff --git a/Sources/SwiftNLP/1. Data Collection/String + SNLPDataItem.swift b/Sources/SwiftNLP/1. Data Collection/String + SNLPDataItem.swift
index b2f85b37..33a0ddd2 100644
--- a/Sources/SwiftNLP/1. Data Collection/String + SNLPDataItem.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/String + SNLPDataItem.swift	
@@ -7,8 +7,12 @@
 
 import Foundation
 
+/*
+        Provides a bare-bones implementation of SNLPDataItem so that String can be used in test cases.
+        - The defaults are placeholders rather than meaningful metadata, but they are enough to work with text.
+ */
 extension String: SNLPDataItem {
-    public var createdOn: Date { Date.now }
+    public var createdOn: Date { Date.distantFuture }
     
     public var id: String { self }
 
diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift
index a81d64d9..f82ca745 100644
--- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift	
+++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift	
@@ -102,7 +102,7 @@ final class DurableHNSWCorpusTests: XCTestCase {
         ]
         
         let query = "I like to read about new technology and artificial intelligence"
-        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
         
         /// Setting up the environment
         let env = try Environment()
@@ -115,7 +115,7 @@ final class DurableHNSWCorpusTests: XCTestCase {
         
         /// Saving the memory map to disk
         let corpus = try DurableHNSWCorpus(
-            encoder: _documentEncoder,
+            encoder: documentEncoder,
             namespace: "testBasicQueryExample",
             in: transaction
         )
@@ -129,13 +129,13 @@ final class DurableHNSWCorpusTests: XCTestCase {
         try transaction.commit()
         
         do {
-            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+            let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) }
             
             /// Reading the memory map (and dictionary) from disk
             let readTransaction = try Transaction.begin(.write, in: env)
             
             let readCorpus = try DurableHNSWCorpus(
-                encoder: _documentEncoder,
+                encoder: documentEncoder,
                 namespace: "testBasicQueryExample",
                 in: readTransaction
             )
@@ -177,10 +177,10 @@ final class DurableHNSWCorpusTests: XCTestCase {
         
         let transaction = try Transaction.begin(.write, in: env)
         
-        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
         
         let corpus = try DurableHNSWCorpus(
-            encoder: _documentEncoder,
+            encoder: documentEncoder,
             namespace: "subreddit_durable",
             in: transaction
         )
@@ -199,7 +199,7 @@ final class DurableHNSWCorpusTests: XCTestCase {
     }
     
     func testQueryGuelphSubredditCorpus() async throws {
-        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
         
         /// Setting up the environment
         let env = try Environment()
@@ -212,7 +212,7 @@ final class DurableHNSWCorpusTests: XCTestCase {
         let transaction = try Transaction.begin(.read, in: env)
         
         let corpus = try DurableHNSWCorpus(
-            encoder: _documentEncoder,
+            encoder: documentEncoder,
             namespace: "subreddit_durable",
             in: transaction
         )
@@ -220,7 +220,7 @@ final class DurableHNSWCorpusTests: XCTestCase {
         corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
         
         let query = "I love waterloo and I love the geese."
-        let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+        let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) }
         
         let result = try corpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction)
         
diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift
index 158bbcbe..8d3609b1 100644
--- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift	
+++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift	
@@ -1,195 +1,68 @@
-#if os(macOS)
-import XCTest
-import Foundation
-import System
-@testable import SwiftNLP
-
-final class EphemeralHNSWCorpusTests: XCTestCase {
-    // MARK: EphemeralHNSWCorpus can also be used as its typealias HNSWCorpus
-    
-    // Load a small set of documents and confirm that corpus and dictionary are updated accordingly
-    func testBuildBasicCorpus() throws {
-        let docs = [
-            "CNTK formerly known as Computational Network Toolkit",
-            "is a free easy-to-use open-source commercial-grade toolkit",
-            "that enable us to train deep learning algorithms to learn like the human brain."
-         ]
-        
-        var corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
-        corpus.addUntokenizedDocuments(docs)
-
-        XCTAssert(corpus.count == 3)
-        
-        /// Make sure none of our encodings are zero
-        for item in corpus {
-            XCTAssertNotEqual(item.vector, corpus.zeroes)
-        }
-    }
-    
-    // Load a bigger set of documents and confirm
-    func testBuildLargeCorpus() throws {
-        let twentyQuotes = [
-            "Imagination is more important than knowledge. - Albert Einstein",
-            "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking",
-            "If I have seen further it is by standing on the shoulders of giants. - Isaac Newton",
-            "The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman",
-            "Science is the belief in the ignorance of experts. - Richard Feynman",
-            "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov",
-            "Science is the poetry of reality. - Richard Dawkins",
-            "To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. - Albert Einstein",
-            "The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré",
-            "Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie",
-            "An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck",
-            "If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan",
-            "The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert",
-            "Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth",
-            "In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac",
-            "Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan",
-            "Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun",
-            "The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein",
-            "One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
-            "All science is either physics or stamp collecting. - Ernest Rutherford"
-        ]
-        
-        var corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
-        corpus.addUntokenizedDocuments(twentyQuotes)
-        
-        XCTAssertEqual(corpus.count, 20)
-        
-        /// Make sure none of our encodings are zero
-        for item in corpus {
-            XCTAssertNotEqual(item.vector, corpus.zeroes)
-        }
-    }
-    
-    func testBuildGuelphSubredditCorpus() async throws {
-        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
-            fatalError("Failed to find waterloo_submissions.zst in test bundle.")
-        }
-        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
-            fatalError("Failed to load waterloo_submissions.zst from test bundle.")
-        }
-        
-        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
-        
-        let corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
-        
-        for submission in submissions {
-            if let text = submission.selftext {
-                corpus.addUntokenizedDocument(text)
-            }
-        }
-
-        XCTAssert(corpus.count == 17999)
-    }
-    
-    // Load a small set of documents and confirm that corpus and dictionary are updated accordingly
-    func testQueryBasicCorpus() async throws {
-        let docs = [
-            "The quick brown fox jumps over the lazy dog",
-            "I enjoy taking long walks along the beach at sunset",
-            "Advances in neural networks have enabled new AI capabilities",
-            "The stock market experienced a significant downturn last week",
-            "Cooking a good meal can be both an art and a science",
-            "The exploration of space is both challenging and rewarding",
-            "Machine learning models are becoming increasingly sophisticated",
-            "I love reading about history and ancient civilizations"
-        ]
-
-        let query = "I like to read about new technology and artificial intelligence"
- 
-        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
-        var corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
-        corpus.addUntokenizedDocuments(docs)
-        
-        do {
-            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
-            let results = try corpus.encodedDocuments.find(near: queryVector, limit: 8)
-            
-            for result in results {
-                print(corpus.getUntokenizedDocument(at: result.id))
-            }
-        } catch {
-            print("Error when trying corpus.encodedDocuments.find(): \(error)")
-        }
-    }
-    
-    func testQueryLargeCorpus() async throws {
-        let docs = [
-            "Imagination is more important than knowledge. - Albert Einstein",
-            "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking",
-            "If I have seen further it is by standing on the shoulders of giants. - Isaac Newton",
-            "The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman",
-            "Science is the belief in the ignorance of experts. - Richard Feynman",
-            "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov",
-            "Science is the poetry of reality. - Richard Dawkins",
-            "To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. - Albert Einstein",
-            "The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré",
-            "Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie",
-            "An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck",
-            "If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan",
-            "The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert",
-            "Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth",
-            "In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac",
-            "Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan",
-            "Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun",
-            "The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein",
-            "One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
-            "All science is either physics or stamp collecting. - Ernest Rutherford"
-        ]
-
-        let query = "I love Albert Einstein!"
-        
-        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
-        var corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoder: _documentEncoder)
-        corpus.addUntokenizedDocuments(docs)
-        
-        do {
-            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
-            let results = try corpus.encodedDocuments.find(near: queryVector, limit: 8)
-            
-            for result in results {
-                print(corpus.getUntokenizedDocument(at: result.id))
-            }
-        } catch {
-            print("Error when trying corpus.encodedDocuments.find(): \(error)")
-        }
-    }
-    
-    func testQueryGuephSubredditCorpus() async throws {
-        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
-            fatalError("Failed to find guelph_submissions.zst in test bundle.")
-        }
-        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
-            fatalError("Failed to load guelph_submissions.zst from test bundle.")
-        }
-        
-        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
-        
-        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
-        let corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoder: _documentEncoder)
-        
-        for submission in submissions {
-            if let text = submission.selftext {
-                corpus.addUntokenizedDocument(text)
-            }
-        }
-        
-        let query = "Mr. Goose is a very important figure at the University of Waterloo."
-
-        do {
-            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
-            let results = try corpus.encodedDocuments.find(near: queryVector, limit: 8)
-            
-            for result in results {
-                print(corpus.getUntokenizedDocument(at: result.id))
-            }
-        } catch {
-            print("Error when trying corpus.encodedDocuments.find(): \(error)")
-        }
-    }
-    
-    func testTypicalNeighborhoodSize() async throws {
+//#if os(macOS)
+//import XCTest
+//import Foundation
+//import System
+//@testable import SwiftNLP
+//
+//final class EphemeralHNSWCorpusTests: XCTestCase {
+//    // MARK: EphemeralHNSWCorpus can also be used as its typealias HNSWCorpus
+//    
+//    // Load a small set of documents and confirm that corpus and dictionary are updated accordingly
+//    func testBuildBasicCorpus() throws {
+//        let docs = [
+//            "CNTK formerly known as Computational Network Toolkit",
+//            "is a free easy-to-use open-source commercial-grade toolkit",
+//            "that enable us to train deep learning algorithms to learn like the human brain."
+//         ]
+//        
+//        var corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
+//        corpus.addUntokenizedDocuments(docs)
+//
+//        XCTAssert(corpus.count == 3)
+//        
+//        /// Make sure none of our encodings are zero
+//        for item in corpus {
+//            XCTAssertNotEqual(item.vector, corpus.zeroes)
+//        }
+//    }
+//    
+//    // Load a bigger set of documents and confirm
+//    func testBuildLargeCorpus() throws {
+//        let twentyQuotes = [
+//            "Imagination is more important than knowledge. - Albert Einstein",
+//            "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking",
+//            "If I have seen further it is by standing on the shoulders of giants. - Isaac Newton",
+//            "The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman",
+//            "Science is the belief in the ignorance of experts. - Richard Feynman",
+//            "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov",
+//            "Science is the poetry of reality. - Richard Dawkins",
+//            "To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. - Albert Einstein",
+//            "The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré",
+//            "Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie",
+//            "An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck",
+//            "If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan",
+//            "The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert",
+//            "Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth",
+//            "In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac",
+//            "Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan",
+//            "Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun",
+//            "The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein",
+//            "One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
+//            "All science is either physics or stamp collecting. - Ernest Rutherford"
+//        ]
+//        
+//        var corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
+//        corpus.addUntokenizedDocuments(twentyQuotes)
+//        
+//        XCTAssertEqual(corpus.count, 20)
+//        
+//        /// Make sure none of our encodings are zero
+//        for item in corpus {
+//            XCTAssertNotEqual(item.vector, corpus.zeroes)
+//        }
+//    }
+//    
+//    func testBuildGuelphSubredditCorpus() async throws {
 //        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
 //            fatalError("Failed to find waterloo_submissions.zst in test bundle.")
 //        }
@@ -199,24 +72,151 @@ final class EphemeralHNSWCorpusTests: XCTestCase {
 //        
 //        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
 //        
-//        let typicalNeighborhoodSizes = [2, 8, 16, 32, 64, 128, 512, 1028]
+//        let corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
+//        
+//        for submission in submissions {
+//            if let text = submission.selftext {
+//                corpus.addUntokenizedDocument(text)
+//            }
+//        }
+//
+//        XCTAssert(corpus.count == 17999)
+//    }
+//    
+//    // Load a small set of documents, then look up and print the stored documents nearest to a sample query
+//    func testQueryBasicCorpus() async throws {
+//        let docs = [
+//            "The quick brown fox jumps over the lazy dog",
+//            "I enjoy taking long walks along the beach at sunset",
+//            "Advances in neural networks have enabled new AI capabilities",
+//            "The stock market experienced a significant downturn last week",
+//            "Cooking a good meal can be both an art and a science",
+//            "The exploration of space is both challenging and rewarding",
+//            "Machine learning models are becoming increasingly sophisticated",
+//            "I love reading about history and ancient civilizations"
+//        ]
+//
+//        let query = "I like to read about new technology and artificial intelligence"
+// 
+//        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+//        var corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
+//        corpus.addUntokenizedDocuments(docs)
+//        
+//        do {
+//            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+//            let results = try corpus.encodedDocuments.find(near: queryVector, limit: 8)
+//            
+//            for result in results {
+//                print(corpus.getUntokenizedDocument(at: result.id))
+//            }
+//        } catch {
+//            print("Error when trying corpus.encodedDocuments.find(): \(error)")
+//        }
+//    }
+//    
+//    func testQueryLargeCorpus() async throws {
+//        let docs = [
+//            "Imagination is more important than knowledge. - Albert Einstein",
+//            "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking",
+//            "If I have seen further it is by standing on the shoulders of giants. - Isaac Newton",
+//            "The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman",
+//            "Science is the belief in the ignorance of experts. - Richard Feynman",
+//            "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov",
+//            "Science is the poetry of reality. - Richard Dawkins",
+//            "To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. - Albert Einstein",
+//            "The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré",
+//            "Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie",
+//            "An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck",
+//            "If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan",
+//            "The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert",
+//            "Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth",
+//            "In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac",
+//            "Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan",
+//            "Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun",
+//            "The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein",
+//            "One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
+//            "All science is either physics or stamp collecting. - Ernest Rutherford"
+//        ]
+//
+//        let query = "I love Albert Einstein!"
+//        
+//        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+//        var corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoder: _documentEncoder)
+//        corpus.addUntokenizedDocuments(docs)
 //        
-//        for typicalNeighborhoodSize in typicalNeighborhoodSizes {
-//            let startTime = Date()
-//            var corpus = HNSWCorpus(encoding: .glove6B50d, typicalNeighborhoodSize: typicalNeighborhoodSize)
+//        do {
+//            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+//            let results = try corpus.encodedDocuments.find(near: queryVector, limit: 8)
 //            
-//            for submission in submissions {
-//                if let text = submission.selftext {
-//                    corpus.addUntokenizedDocument(text)
-//                }
+//            for result in results {
+//                print(corpus.getUntokenizedDocument(at: result.id))
 //            }
+//        } catch {
+//            print("Error when trying corpus.encodedDocuments.find(): \(error)")
+//        }
+//    }
+//    
+//    func testQueryGuelphSubredditCorpus() async throws {
+//        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
+//            fatalError("Failed to find guelph_submissions.zst in test bundle.")
+//        }
+//        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
+//            fatalError("Failed to load guelph_submissions.zst from test bundle.")
+//        }
+//        
+//        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
+//        
+//        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+//        let corpus = HNSWCorpus<Double,ContextFreeEncoder,String>(encoder: _documentEncoder)
+//        
+//        for submission in submissions {
+//            if let text = submission.selftext {
+//                corpus.addUntokenizedDocument(text)
+//            }
+//        }
+//        
+//        let query = "Mr. Goose is a very important figure at the University of Waterloo."
 //
-//            XCTAssert(corpus.count == 17999)
+//        do {
+//            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+//            let results = try corpus.encodedDocuments.find(near: queryVector, limit: 8)
 //            
-//            let endTime = Date()
-//            print("Typical neighborhood size \(typicalNeighborhoodSize) took \(endTime.timeIntervalSince(startTime)) seconds.")
+//            for result in results {
+//                print(corpus.getUntokenizedDocument(at: result.id))
+//            }
+//        } catch {
+//            print("Error when trying corpus.encodedDocuments.find(): \(error)")
 //        }
-    }
-}
-#endif
-
+//    }
+//    
+//    func testTypicalNeighborhoodSize() async throws {
+////        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
+////            fatalError("Failed to find Guelph_submissions.zst in test bundle.")
+////        }
+////        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
+////            fatalError("Failed to load Guelph_submissions.zst from test bundle.")
+////        }
+////        
+////        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
+////        
+////        let typicalNeighborhoodSizes = [2, 8, 16, 32, 64, 128, 512, 1028]
+////        
+////        for typicalNeighborhoodSize in typicalNeighborhoodSizes {
+////            let startTime = Date()
+////            var corpus = HNSWCorpus(encoding: .glove6B50d, typicalNeighborhoodSize: typicalNeighborhoodSize)
+////            
+////            for submission in submissions {
+////                if let text = submission.selftext {
+////                    corpus.addUntokenizedDocument(text)
+////                }
+////            }
+////
+////            XCTAssert(corpus.count == 17999)
+////            
+////            let endTime = Date()
+////            print("Typical neighborhood size \(typicalNeighborhoodSize) took \(endTime.timeIntervalSince(startTime)) seconds.")
+////        }
+//    }
+//}
+//#endif
+//
diff --git a/Tests/SwiftNLPTests/2. Encoding/ContextFreeEncoderTests.swift b/Tests/SwiftNLPTests/2. Encoding/ContextFreeEncoderTests.swift
index 78550cc8..e7b9c063 100644
--- a/Tests/SwiftNLPTests/2. Encoding/ContextFreeEncoderTests.swift	
+++ b/Tests/SwiftNLPTests/2. Encoding/ContextFreeEncoderTests.swift	
@@ -13,13 +13,13 @@ final class ContextFreeEncoderTests: XCTestCase {
             "that enable us to train deep learning algorithms to learn like the human brain."
          ]
         
-        var corpus = DictionaryCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
+        var corpus = InMemoryCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
         corpus.addUntokenizedDocuments(docs)
         
         XCTAssert(corpus.encodedDocuments.count == 3)
         
         // Make sure none of our encodings are zero
-        for c in corpus {
+        for c in corpus.encodedDocuments {
             XCTAssertNotEqual(c, corpus.zeroes)
         }
     }
@@ -50,14 +50,14 @@ final class ContextFreeEncoderTests: XCTestCase {
             "All science is either physics or stamp collecting. - Ernest Rutherford"
         ]
         
-        var corpus = DictionaryCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
+        var corpus = InMemoryCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
         corpus.addUntokenizedDocuments(twentyQuotes)
         
         
         XCTAssertEqual(corpus.encodedDocuments.count, 20)
         
         // Make sure none of our encodings are zero
-        for c in corpus {
+        for c in corpus.encodedDocuments {
             XCTAssertNotEqual(c, corpus.zeroes)
         }
     }
@@ -75,7 +75,7 @@ final class ContextFreeEncoderTests: XCTestCase {
         
         //print("Errors: \(errors.count)")
         
-        let corpus = DictionaryCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
+        var corpus = InMemoryCorpus<Double,ContextFreeEncoder,String>(encoding: .glove6B50d)
         for submission in submissions {
             if let text = submission.selftext {
                 corpus.addUntokenizedDocument(text)
diff --git a/Tests/SwiftNLPTests/2. Encoding/NaturalLanguageEncoderTests.swift b/Tests/SwiftNLPTests/2. Encoding/NaturalLanguageEncoderTests.swift
index f40a7b7d..6c49181c 100644
--- a/Tests/SwiftNLPTests/2. Encoding/NaturalLanguageEncoderTests.swift	
+++ b/Tests/SwiftNLPTests/2. Encoding/NaturalLanguageEncoderTests.swift	
@@ -15,13 +15,13 @@ final class NaturalLanguageEncoderTests: XCTestCase {
          ]
         
         let encoder = NaturalLanguageEncoder<Double>()
-        var corpus = DictionaryCorpus<Double,NaturalLanguageEncoder,String>(encoder: encoder)
+        var corpus = InMemoryCorpus<Double,NaturalLanguageEncoder,String>(encoder: encoder)
         corpus.addUntokenizedDocuments(docs)
         
         XCTAssert(corpus.encodedDocuments.count == 3)
         
         // Make sure none of our encodings are zero
-        for c in corpus {
+        for c in corpus.encodedDocuments {
             XCTAssertNotEqual(c, corpus.zeroes)
         }
     }
@@ -53,14 +53,14 @@ final class NaturalLanguageEncoderTests: XCTestCase {
         ]
         
         let encoder = NaturalLanguageEncoder<Double>()
-        var corpus = DictionaryCorpus<Double,NaturalLanguageEncoder,String>(encoder: encoder)
+        var corpus = InMemoryCorpus<Double,NaturalLanguageEncoder,String>(encoder: encoder)
         corpus.addUntokenizedDocuments(twentyQuotes)
         
         
         XCTAssertEqual(corpus.encodedDocuments.count, 20)
         
         // Make sure none of our encodings are zero
-        for c in corpus {
+        for c in corpus.encodedDocuments {
             XCTAssertNotEqual(c, corpus.zeroes)
         }
     }
@@ -77,7 +77,7 @@ final class NaturalLanguageEncoderTests: XCTestCase {
         let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
         
         let encoder = NaturalLanguageEncoder<Double>()
-        let corpus = DictionaryCorpus<Double,NaturalLanguageEncoder,String>(encoder: encoder)
+        let corpus = InMemoryCorpus<Double,NaturalLanguageEncoder,String>(encoder: encoder)
         
         for submission in submissions {
             if let text = submission.selftext {
-- 
GitLab