Commit 5389ad93 authored by Jim Wallace
Restored DurableHNSWCorpus + adjusted to new SNLPCorpus protocol

parent 4bd282f5
Merge request !14: Improved ergonomics for generic types: SNLPCorpus, SNLPEncoder, InMemoryCorpus
Pipeline #115905 passed with warnings
Showing 395 additions and 357 deletions
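The generics in this diff are written against the reworked SNLPCorpus / SNLPEncoder protocols named in the merge request. Their definitions are not part of this commit, but the constraints used below (Item.fullText, Encoder.Scalar, a zero-argument Encoder(), zeroes, dimensions, encodeToken/encodeSentence) imply shapes roughly like the following sketch; treat every detail here as inferred from usage, not as the MR's actual declarations:

// Sketch only: protocol shapes inferred from how this diff uses them.
protocol SNLPDataItem {
    var fullText: String { get }
}

protocol SNLPEncoder {
    associatedtype Scalar: BinaryFloatingPoint
    var zeroes: [Scalar] { get }
    var dimensions: UInt { get }
    init()
    func encodeToken(_ token: String) -> [Scalar]
    func encodeSentence(_ sentence: String) -> [Scalar]
}

protocol SNLPCorpus {
    associatedtype Item: SNLPDataItem
    associatedtype Encoder: SNLPEncoder
    func searchFor(_ query: String) -> [Item]
}

The tests below use String as the Item type, so presumably the MR also gives String a trivial SNLPDataItem conformance (fullText returning self). The document-ingestion requirements are omitted from this sketch because the diff never shows them.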
@@ -32,9 +32,9 @@ extension DurableHNSWCorpus {
     /// This extension is used for the dictionary operations
     public struct DocumentVectorPair {
         var untokenizedDocument: String
-        var vector: [Scalar]
+        var vector: [Encoder.Scalar]

-        init(untokenizedDocument: String, vector: [Scalar]) {
+        init(untokenizedDocument: String, vector: [Encoder.Scalar]) {
             self.untokenizedDocument = untokenizedDocument
             self.vector = vector
         }
@@ -50,7 +50,7 @@ extension DurableHNSWCorpus {
     }

     @inlinable
-    func getVector(at key: Int) -> [Scalar] {
+    func getVector(at key: Int) -> [Encoder.Scalar] {
         if let pair = dictionary[key] {
             return pair.vector
         } else {
@@ -63,7 +63,7 @@ extension DurableHNSWCorpus {
         return dictionary
     }

-    func addDocumentVectorPair(at key: Int, document: String, vector: [Scalar]) {
+    func addDocumentVectorPair(at key: Int, document: String, vector: [Encoder.Scalar]) {
         dictionary[key] = DocumentVectorPair(
             untokenizedDocument: document,
             vector: vector
......
@@ -131,12 +131,12 @@ extension DurableHNSWCorpus {
         let vectorLength = vectorLengthData.withUnsafeBytes { $0.load(as: Int.self) }
         index += MemoryLayout<Int>.size

-        var vector = [Scalar]()
+        var vector = [Encoder.Scalar]()
         for _ in 0..<vectorLength {
-            let scalarData = data.subdata(in: index..<index+MemoryLayout<Scalar>.size)
-            let scalar = scalarData.withUnsafeBytes { $0.load(as: Scalar.self) }
+            let scalarData = data.subdata(in: index..<index+MemoryLayout<Encoder.Scalar>.size)
+            let scalar = scalarData.withUnsafeBytes { $0.load(as: Encoder.Scalar.self) }
             vector.append(scalar)
-            index += MemoryLayout<Scalar>.size
+            index += MemoryLayout<Encoder.Scalar>.size
         }

         // Add the key-value pair to the dictionary
......
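The read loop above implies a simple binary layout for each stored vector: one Int holding the element count, followed by that many raw scalars. The matching write side is not shown in this diff; a minimal sketch of what it presumably looks like, using only standard-library calls:

// Sketch: serialize a vector in the layout the loop above reads back
// ([Int count][scalar 0][scalar 1]...). Not the package's actual writer.
import Foundation

func appendVector<Scalar: BinaryFloatingPoint>(_ vector: [Scalar], to data: inout Data) {
    var count = vector.count
    withUnsafeBytes(of: &count) { data.append(contentsOf: $0) }      // Int header
    for var scalar in vector {
        withUnsafeBytes(of: &scalar) { data.append(contentsOf: $0) } // raw scalar bytes
    }
}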
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-03-16.
//
#if os(macOS)
import Foundation
/// HNSWCorpus iterates through its dictionary of key to document vector pairs
extension DurableHNSWCorpus: Sequence, Collection {
// Sequence Protocol Requirements
@inlinable
func makeIterator() -> AnyIterator<DocumentVectorPair> {
var iterator = dictionary.values.makeIterator()
return AnyIterator {
return iterator.next()
}
}
// Collection Protocol Requirements
@inlinable
var startIndex: Int {
return dictionary.keys.sorted().startIndex
}
@inlinable
var endIndex: Int {
return dictionary.keys.sorted().endIndex
}
@inlinable
subscript(position: Int) -> DocumentVectorPair {
let key = dictionary.keys.sorted()[position]
guard let pair = dictionary[key] else {
fatalError("Key \(key) not found in HNSW dictionary")
}
return pair
}
@inlinable
func index(after i: Int) -> Int {
return dictionary.keys.sorted().index(after: i)
}
}
#endif
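With the Sequence and Collection conformances restored, client code can traverse a corpus directly; hypothetical usage:

// Hypothetical client code: iterate every stored document/vector pair.
for pair in corpus {
    print(pair.untokenizedDocument, pair.vector.count)
}

One design note: the Sequence path (makeIterator) visits dictionary.values in dictionary order, while the Collection path (startIndex, endIndex, subscript, index(after:)) sorts the keys on every call, so the two traversal orders can differ and each Collection operation pays an O(n log n) sort; caching the sorted key array would make a full indexed traversal linear.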
@@ -34,50 +34,87 @@ import CoreLMDBCoders

-// MARK: DurableHNSWCorpus cannot conform to SNLPCorpus under its current definition
-// This is because addingUntokenizedDocuments in a DurableHNSWCorpus requires an additional parameter (transaction) and can throw
-final class DurableHNSWCorpus<Scalar: BinaryFloatingPoint & Codable & UnsafeMemoryLayoutStorableFloat> {
+final class DurableHNSWCorpus<Item: SNLPDataItem, Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable & UnsafeMemoryLayoutStorableFloat {

     public typealias HNSWDictionary = [Int: DocumentVectorPair]

-    internal var documentEncoder: any SNLPEncoder<Scalar>
-    var zeroes: [Scalar] { documentEncoder.zeroes }
-    var encodedDocuments: DeterministicDurableVectorIndex<Scalar>
-    var count: Int { encodedDocuments.size }
+    internal var documentEncoder: Encoder
+    internal var documents = ContiguousArray<Item>()
+    internal var encodedDocuments = ContiguousArray<[Encoder.Scalar]>()
+    var index: DeterministicDurableVectorIndex<Encoder.Scalar>

     private let ONE_GB: Int = 1_073_741_824
     private let ONE_MB: Int = 1_048_576
     private let ONE_KB: Int = 1_024
     private let ONE_B: Int = 1
     private let DEFAULT_MAXREADERS: UInt32 = 126
     private let DEFAULT_MAXDBS: UInt32 = 10

     // Keeps track of the original document for client code
     var dictionary: HNSWDictionary = [:]

     // typicalNeighbourhoodSize = 20 is a standard benchmark
-    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
-        documentEncoder = ContextFreeEncoder(source: encoding)
-
-        encodedDocuments = try DeterministicDurableVectorIndex<Scalar>(
-            namespace: namespace,
-            typicalNeighborhoodSize: typicalNeighborhoodSize,
-            in: transaction
-        )
-    }
+//    init(encoding: ContextFreeEncoder<Encoder.Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
+//        documentEncoder = ContextFreeEncoder<Encoder.Scalar>(source: encoding) as! Encoder
+//
+//        index = try DeterministicDurableVectorIndex<Encoder.Scalar>(
+//            namespace: namespace,
+//            typicalNeighborhoodSize: typicalNeighborhoodSize,
+//            in: transaction
+//        )
+//    }

-    init(encoder: any SNLPEncoder<Scalar>, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
+    init(encoder: Encoder = Encoder(), typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
         documentEncoder = encoder

-        encodedDocuments = try DeterministicDurableVectorIndex<Scalar>(
+        index = try DeterministicDurableVectorIndex<Encoder.Scalar>(
             namespace: namespace,
             typicalNeighborhoodSize: typicalNeighborhoodSize,
             in: transaction
         )
     }

     @inlinable
-    func addUntokenizedDocument(_ document: String, in transaction: Transaction) throws {
-        /// forced unwrap as! [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder
-        /// encodedDocuments.insert will insert and return the corresponding key (id)s
-        let encodedVector = documentEncoder.encodeSentence(document)
-        let key = try encodedDocuments.insert(encodedVector, in: transaction)
+    func addUntokenizedDocument(_ document: Item, in transaction: Transaction) throws {
+        /// encodedDocuments.insert will insert and return the corresponding key (id)s
+        documents.append(document)
+        encodedDocuments.append(documentEncoder.encodeSentence(document.fullText))
+        assert( documents.count == encodedDocuments.count )
+
+        let encodedVector = documentEncoder.encodeSentence(document.fullText)
+        let key = try index.insert(encodedVector, in: transaction)

         addDocumentVectorPair(
             at: key,
-            document: document,
+            document: document.fullText,
             vector: encodedVector
         )
     }
+
+    func searchFor(_ query: String) -> [Item] {
+        return []
+    }
+
+//    func searchFor(_ query: String, in transaction: Transaction) -> [Item] {
+//        let queryVector = documentEncoder.encodeToken(query)
+//        let results = try! index.find(near: queryVector, limit: 8, in: transaction)
+//
+//        return results.map{ documents[$0.id] }
+//        return []
+//    }
 }

 #endif
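Now that DurableHNSWCorpus conforms to SNLPCorpus, callers can be written against the protocol instead of a concrete corpus type. A sketch of the ergonomic payoff (hypothetical helper, not part of the MR):

// Hypothetical generic helper enabled by the SNLPCorpus conformance.
func firstMatch<C: SNLPCorpus>(in corpus: C, for query: String) -> C.Item? {
    corpus.searchFor(query).first
}

Two details worth flagging in addUntokenizedDocument above: the document is encoded twice (once for the encodedDocuments array, once for the index insert), so hoisting a single encodeSentence call would halve the encoding work; and searchFor(_:) currently returns [] because the real lookup needs a Transaction, as the commented-out variant shows.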
@@ -34,7 +34,7 @@ import Foundation

 // MARK: Allow EphemeralHNSWCorpus to simply be used as HNSWCorpus
 typealias HNSWCorpus = EphemeralHNSWCorpus

-final class EphemeralHNSWCorpus<Item: SNLPDataItem,Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable{
+final class EphemeralHNSWCorpus<Item: SNLPDataItem,Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable {
public typealias HNSWDictionary = [Int: DocumentVectorPair]
......
@@ -26,18 +26,19 @@
 import CoreML

-//class CoreMLEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
-//
-//    var zeroes: [Scalar]
-//
-//    func encodeToken(_ token: String) -> [Scalar] {
-//        fatalError("CoreMLEncoder not implemented yet. Get on it.")
-//    }
-//
-//    func encodeSentence(_ sentence: String) -> [Scalar] {
-//        fatalError("CoreMLEncoder not implemented yet. Get on it.")
-//    }
-//}
+struct CoreMLEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
+
+    var zeroes: [Scalar] = []
+    var dimensions: UInt = 0
+
+    func encodeToken(_ token: String) -> [Scalar] {
+        fatalError("CoreMLEncoder not implemented yet. Get on it.")
+    }
+
+    func encodeSentence(_ sentence: String) -> [Scalar] {
+        fatalError("CoreMLEncoder not implemented yet. Get on it.")
+    }
+}
//@available(macOS 13.0, *)
//public class MiniLMEmbeddings {
......
@@ -25,7 +25,7 @@
 import Foundation
 import NaturalLanguage

-struct NaturalLanguageEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
+struct NaturalLanguageEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
var dimensions: UInt = 512
var zeroes: [Scalar] { Array(repeating: Scalar(0), count: Int(dimensions)) }
......
@@ -23,18 +23,22 @@
 import Foundation

-//class OpenAIEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPAsyncEncoder {
-//
-//    var zeroes: [Scalar]
-//
-//    func fetchEncodingForToken(_ token: String) async throws -> [Scalar] {
-//        fatalError("OpenAIEncoder not implemented. Get on it.")
-//    }
-//
-//    func fetchEncodingForSentence(_ sentence: String) async throws -> [Scalar] {
-//        fatalError("OpenAIEncoder not implemented. Get on it.")
-//    }
-//}
+struct OpenAIEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPAsyncEncoder {
+
+    var zeroes: [Scalar]
+    var dimensions: UInt
+
+    init() {
+        fatalError()
+    }
+
+    func fetchEncodingForToken(_ token: String) async throws -> [Scalar] {
+        fatalError("OpenAIEncoder not implemented. Get on it.")
+    }
+
+    func fetchEncodingForSentence(_ sentence: String) async throws -> [Scalar] {
+        fatalError("OpenAIEncoder not implemented. Get on it.")
+    }
+}
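Unlike the synchronous SNLPEncoder, SNLPAsyncEncoder exposes async throwing fetchers, so call sites will await them. Hypothetical usage once the stub is implemented (the current init() deliberately traps):

// Hypothetical call site once OpenAIEncoder is implemented;
// illustrative only, since init() currently calls fatalError().
func embed(_ sentence: String) async throws -> [Double] {
    let encoder = OpenAIEncoder<Double>()
    return try await encoder.fetchEncodingForSentence(sentence)
}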
#if os(macOS)
import XCTest
import Foundation
import CoreLMDB
import System
@testable import SwiftNLP
// MARK: These tests are not to be included within the pipeline
final class DurableHNSWCorpusTests: XCTestCase {
/// This is used to skip these tests in the GitLab pipeline
override class var defaultTestSuite: XCTestSuite {
if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" {
return XCTestSuite(name: "Empty")
}
return super.defaultTestSuite
}
/// Setting up constants for environment
private let ONE_GB: Int = 1_073_741_824
private let ONE_MB: Int = 1_048_576
private let ONE_KB: Int = 1_024
private let ONE_B: Int = 1
private let DEFAULT_MAXREADERS: UInt32 = 126
private let DEFAULT_MAXDBS: UInt32 = 10
/// Setting up working directory
private var workingDirectoryPath: FilePath!
override func setUpWithError() throws {
try super.setUpWithError()
let fileManager = FileManager.default
let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
workingDirectoryPath = FilePath(directoryURL.path)
/// This commented out code alternatively works in the XCode bundle resource environment
// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") }
// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb")
// let fileManager = FileManager.default
// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil)
// print("Resources directory: \(resourcesDirectoryURL)")
// workingDirectoryPath = FilePath(resourcesDirectoryURL.path)
}
func testBuildBasicCorpus() throws {
let docs = [
"CNTK formerly known as Computational Network Toolkit",
"is a free easy-to-use open-source commercial-grade toolkit",
"that enable us to train deep learning algorithms to learn like the human brain."
]
/// Setting up the environment
let env = try Environment()
try env.setMapSize(ONE_GB)
try env.setMaxReaders(DEFAULT_MAXREADERS)
try env.setMaxDBs(DEFAULT_MAXDBS)
try env.open(path: workingDirectoryPath)
/// Writing to LMDB
let transaction = try Transaction.begin(.write, in: env)
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
namespace: "testBasicExample",
in: transaction
)
for doc in docs {
try corpus.addUntokenizedDocument(doc, in: transaction)
}
try transaction.commit()
/// Reading from LMDB
let readTransaction = try Transaction.begin(.read, in: env)
let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
namespace: "testBasicExample",
in: readTransaction
)
readTransaction.abort()
// XCTAssert(readCorpus.count == 3)
/// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads
/// This is because size is only incremented when insertion is called but it is not called when read from disk!
}
func testQueryBasicCorpus() async throws {
let docs = [
"The quick brown fox jumps over the lazy dog",
"I enjoy taking long walks along the beach at sunset",
"Advances in neural networks have enabled new AI capabilities",
"The stock market experienced a significant downturn last week",
"Cooking a good meal can be both an art and a science",
"The exploration of space is both challenging and rewarding",
"Machine learning models are becoming increasingly sophisticated",
"I love reading about history and ancient civilizations"
]
let query = "I like to read about new technology and artificial intelligence"
//let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
/// Setting up the environment
let env = try Environment()
try env.setMapSize(ONE_GB)
try env.setMaxReaders(DEFAULT_MAXREADERS)
try env.setMaxDBs(DEFAULT_MAXDBS)
try env.open(path: workingDirectoryPath)
let transaction = try Transaction.begin(.write, in: env)
/// Saving the memory map to disk
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
namespace: "testBasicQueryExample",
in: transaction
)
for doc in docs {
try corpus.addUntokenizedDocument(doc, in: transaction)
}
corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
try transaction.commit()
do {
let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) }
/// Reading the memory map (and dictionary) from disk
let readTransaction = try Transaction.begin(.write, in: env)
let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
namespace: "testBasicQueryExample",
in: readTransaction
)
readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer?
let results = try readCorpus.index.find(near: queryVector, limit: 8, in: readTransaction)
for result in results {
let key = Int(result.id.foreignKey)!
print(readCorpus.getUntokenizedDocument(at: key))
}
try readTransaction.commit()
} catch {
print("Error when trying readCorpus.index.find(): \(error)")
}
}
func testBuildGuelphSubredditCorpus() async throws {
/// Generates the LMDB durable storage to disk but runs no tests otherwise
/// Setting up the environment
let env = try Environment()
try env.setMapSize(ONE_GB)
try env.setMaxReaders(DEFAULT_MAXREADERS)
try env.setMaxDBs(DEFAULT_MAXDBS)
try env.open(path: workingDirectoryPath)
/// Get subreddit data
guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
fatalError("Failed to find Guelph_submissions.zst in test bundle.")
}
guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
fatalError("Failed to load Guelph_submissions.zst from test bundle.")
}
let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
let transaction = try Transaction.begin(.write, in: env)
let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
encoder: documentEncoder,
namespace: "subreddit_durable",
in: transaction
)
/// Add documents to corpus
for submission in submissions {
if let text = submission.selftext {
try corpus.addUntokenizedDocument(text, in: transaction)
}
}
/// Save dictionary to disk
corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
try transaction.commit()
}
func testQueryGuelphSubredditCorpus() async throws {
let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
/// Setting up the environment
let env = try Environment()
try env.setMapSize(ONE_GB)
try env.setMaxReaders(DEFAULT_MAXREADERS)
try env.setMaxDBs(DEFAULT_MAXDBS)
try env.open(path: workingDirectoryPath)
/// Reading the memory map (and dictionary) from disk
let transaction = try Transaction.begin(.read, in: env)
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
encoder: documentEncoder,
namespace: "subreddit_durable",
in: transaction
)
corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
let query = "I love waterloo and I love the geese."
let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) }
let results = try corpus.index.find(near: queryVector, limit: 8, in: transaction)
for result in results {
let key = Int(result.id.foreignKey)!
print(corpus.getUntokenizedDocument(at: key))
}
}
}
#endif
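A possible interim check for the size-on-read bug noted in testBuildBasicCorpus: the dictionary persisted via saveDictionaryToDownloads still reflects every insert, so after reloading it, its count can stand in for corpus.count. A sketch of the read phase under that assumption (it presumes the write phase saved the dictionary, as testQueryBasicCorpus does):

// Sketch: derive the document count from the persisted dictionary,
// since the index's in-memory size counter is not restored on read.
let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
    namespace: "testBasicExample",
    in: readTransaction
)
readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
XCTAssertEqual(readCorpus.dictionary.count, 3)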
@@ -165,7 +165,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase {
     let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)

     //let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
-    let corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>()
+    let corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>(typicalNeighborhoodSize: 10)

     for submission in submissions {
         if let text = submission.selftext {
......