Commit a3421974 authored by Mingchung Xia

Code optimization and clarity

parent a9264b06
1 merge request: !13 HNSW Implementation with Testcases
Pipeline #114208 passed with warnings
@@ -9,7 +9,7 @@ import Foundation
 import CoreLMDB
 import CoreLMDBCoders
-class DurableHNSWCorpus/*<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus*/ {
+final class DurableHNSWCorpus/*<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus*/ {
     public typealias Scalar = Double /// This is a placeholder to make things work easier right now
     public typealias HNSWDictionary = [Int: DocumentVectorPair]
...
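The only functional change in this hunk is the `final` modifier. For context, `final` forbids subclassing, which also lets the compiler use static dispatch for the class's members; a minimal illustration (type names hypothetical, not from this diff):

final class Example {}
// class Sub: Example {}   // error: inheritance from a final class 'Example'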
@@ -9,7 +9,7 @@ import Foundation
 // MARK: Decodable conformance is in HNSWCorpus
-extension HNSWCorpus: Codable {
+extension EphemeralHNSWCorpus: Codable {
     enum CodingKeys: String, CodingKey {
         case _documentEncoder
         case encodedDocuments
...
@@ -7,7 +7,7 @@
 import Foundation
-extension HNSWCorpus {
+extension EphemeralHNSWCorpus {
     /// This extension is used for the dictionary operations
     public struct DocumentVectorPair {
         var untokenizedDocument: String
@@ -50,7 +50,7 @@ extension HNSWCorpus {
     }
 }
-extension HNSWCorpus.DocumentVectorPair: Codable where Scalar: Codable {
+extension EphemeralHNSWCorpus.DocumentVectorPair: Codable where Scalar: Codable {
     enum CodingKeys: String, CodingKey {
         case untokenizedDocument
         case vector
...
@@ -21,9 +21,9 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 /// It may be more useful to make the conformances based on the dictionary instead of encodedDocuments
 /// HNSWCorpus iterates through its dictionary of key to document vector pairs
-extension HNSWCorpus: Sequence, Collection {
+extension EphemeralHNSWCorpus: Sequence, Collection {
     typealias Element = [Scalar]
...
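The doc comment above describes iterating the corpus through its dictionary of key-to-document-vector pairs. A minimal sketch of what such a conformance can look like, assuming the dictionary uses dense integer keys 0..<count (an illustration, not the repository's actual implementation):

extension EphemeralHNSWCorpus: Sequence, Collection {
    typealias Element = [Scalar]

    var startIndex: Int { 0 }
    var endIndex: Int { dictionary.count }

    func index(after i: Int) -> Int { i + 1 }

    // Force-unwrapping is safe only while keys remain dense 0..<count.
    subscript(position: Int) -> [Scalar] {
        dictionary[position]!.vector
    }
}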
@@ -23,7 +23,10 @@
 import Foundation
-class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
+// MARK: Allow EphemeralHNSWCorpus to simply be used as HNSWCorpus
+typealias HNSWCorpus = EphemeralHNSWCorpus
+
+final class EphemeralHNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
     public typealias HNSWDictionary = [Int: DocumentVectorPair]
...
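Thanks to the typealias, code written against the old name keeps compiling: `HNSWCorpus<Double>` now simply resolves to `EphemeralHNSWCorpus<Double>`. A hypothetical call site (the initializer arguments are assumed here, not taken from this diff):

let corpus: HNSWCorpus<Double> = EphemeralHNSWCorpus(encoding: .glove6B50d)
// `HNSWCorpus<Double>` and `EphemeralHNSWCorpus<Double>` denote one and the same type.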
@@ -42,17 +42,15 @@ extension DurableVectorIndex {
 public struct DeterministicDurableVectorIndex/*<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint*/ {
     public typealias Vector = [Double]
     public typealias Index = DurableVectorIndex<CartesianDistanceMetric<Vector>, Vector.Element>
-    // public typealias Index = DurableVectorIndex<CosineSimilarityMetric<Vector>, Vector.Element>
     public var base: Index
     public var typicalNeighborhoodSize: Int
-    public var size: Int = 0
+    public var size: Int = 0 // TODO: This size is not set when read from LMDB
     private var srng = SeedableRandomNumberGenerator(seed: 1)
-    // private var drng = DeterministicRandomNumberGenerator(seed: 1)
     public init(namespace: String, typicalNeighborhoodSize: Int = 20, in transaction: Transaction) throws {
         let metric = CartesianDistanceMetric<Vector>()
-        // let metric = CosineSimilarityMetric<Vector>()
         let config = Config.unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize)
         self.base = try Index(
             namespace: namespace,
...
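This hunk deletes the commented-out cosine alternatives, committing the index to Cartesian (Euclidean) distance. For reference, the two metrics rank neighbors differently; a self-contained sketch of both (plain functions for illustration, not CoreLMDB's actual metric protocol):

struct CartesianDistance {
    // Euclidean (L2) distance: smaller means closer; sensitive to vector magnitude.
    func distance(_ a: [Double], _ b: [Double]) -> Double {
        zip(a, b).map { ($0 - $1) * ($0 - $1) }.reduce(0, +).squareRoot()
    }
}

struct CosineDistance {
    // 1 - cosine similarity: compares direction only, ignoring magnitude.
    func distance(_ a: [Double], _ b: [Double]) -> Double {
        let dot = zip(a, b).map(*).reduce(0, +)
        let normA = a.map { $0 * $0 }.reduce(0, +).squareRoot()
        let normB = b.map { $0 * $0 }.reduce(0, +).squareRoot()
        return 1 - dot / (normA * normB)
    }
}

Cosine distance is the more common choice for GloVe-style embeddings, which may be why the alternative lingered as a comment before this cleanup.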
#if os(macOS)
import XCTest
import Foundation
import CoreLMDB
import System
@testable import SwiftNLP
final class DurableHNSWCorpusTests: XCTestCase {
    /// Setting up constants for environment
    private let ONE_GB: Int = 1_073_741_824
    private let ONE_MB: Int = 1_048_576
    private let ONE_KB: Int = 1_024
    private let ONE_B: Int = 1
    private let DEFAULT_MAXREADERS: UInt32 = 126
    private let DEFAULT_MAXDBS: UInt32 = 10

    /// Setting up working directory
    private var workingDirectoryPath: FilePath!

    override func setUpWithError() throws {
        try super.setUpWithError()

        let fileManager = FileManager.default
        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)

        workingDirectoryPath = FilePath(directoryURL.path)
    }
    func testBasicExample() throws {
        let docs = [
            "CNTK formerly known as Computational Network Toolkit",
            "is a free easy-to-use open-source commercial-grade toolkit",
            "that enable us to train deep learning algorithms to learn like the human brain."
        ]

        /// Setting up the environment
        let env = try Environment()
        try env.setMapSize(ONE_GB)
        try env.setMaxReaders(DEFAULT_MAXREADERS)
        try env.setMaxDBs(DEFAULT_MAXDBS)
        try env.open(path: workingDirectoryPath)

        /// Writing to LMDB
        let transaction = try Transaction.begin(.write, in: env)

        var corpus = try DurableHNSWCorpus(
            encoding: .glove6B50d,
            namespace: "testBasicExample",
            in: transaction
        )

        for doc in docs {
            try corpus.addUntokenizedDocument(doc, in: transaction)
        }

        try transaction.commit()

        /// Reading from LMDB
        let readTransaction = try Transaction.begin(.read, in: env)

        let readCorpus = try DurableHNSWCorpus(
            encoding: .glove6B50d,
            namespace: "testBasicExample",
            in: readTransaction
        )

        readTransaction.abort()

        // XCTAssert(readCorpus.count == 3)
        /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads:
        /// size is only incremented when insertion is called, and insertion never runs when the corpus is read back from disk!
    }
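    // A possible direction for the size-on-read bug noted above: restore the
    // count when opening an existing namespace. `storedVectorCount(in:)` is a
    // hypothetical helper, NOT part of the current API:
    //
    //     var corpus = try DurableHNSWCorpus(encoding: .glove6B50d, namespace: "testBasicExample", in: readTransaction)
    //     corpus.encodedDocuments.size = try corpus.encodedDocuments.base.storedVectorCount(in: readTransaction)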
    func testBasicQueryExample() async throws {
        let docs = [
            "The quick brown fox jumps over the lazy dog",
            "I enjoy taking long walks along the beach at sunset",
            "Advances in neural networks have enabled new AI capabilities",
            "The stock market experienced a significant downturn last week",
            "Cooking a good meal can be both an art and a science",
            "The exploration of space is both challenging and rewarding",
            "Machine learning models are becoming increasingly sophisticated",
            "I love reading about history and ancient civilizations"
        ]

        let query = "I like to read about new technology and artificial intelligence"

        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)

        /// Setting up the environment
        let env = try Environment()
        try env.setMapSize(ONE_GB)
        try env.setMaxReaders(DEFAULT_MAXREADERS)
        try env.setMaxDBs(DEFAULT_MAXDBS)
        try env.open(path: workingDirectoryPath)

        let transaction = try Transaction.begin(.write, in: env)

        /// Saving the memory map to disk
        var corpus = try DurableHNSWCorpus(
            encoder: _documentEncoder,
            namespace: "testBasicQueryExample",
            in: transaction
        )

        for doc in docs {
            try corpus.addUntokenizedDocument(doc, in: transaction)
        }

        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")

        try transaction.commit()

        do {
            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }

            /// Reading the memory map (and dictionary) from disk
            let readTransaction = try Transaction.begin(.write, in: env)

            let readCorpus = try DurableHNSWCorpus(
                encoder: _documentEncoder,
                namespace: "testBasicQueryExample",
                in: readTransaction
            )

            readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer?

            let results = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: readTransaction)
            for result in results {
                let key = Int(result.id.foreignKey)!
                print(readCorpus.getUntokenizedDocument(at: key))
            }

            try readTransaction.commit()
        } catch {
            print("Error when trying corpus.encodedDocuments.find(): \(error)")
        }
    }
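    // The query flow above (encode the query, find the nearest vectors, then
    // map each result's foreign key back to its untokenized document) could be
    // folded into a convenience method; sketched as a comment only, since no
    // such helper exists in the API yet:
    //
    //     func nearestDocuments(to query: String, limit: Int, in transaction: Transaction) throws -> [String]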
    func testBuildSubredditCorpus() async throws {
        /// Generates the LMDB durable storage to disk but runs no tests otherwise

        /// Setting up the environment
        let env = try Environment()
        try env.setMapSize(ONE_GB)
        try env.setMaxReaders(DEFAULT_MAXREADERS)
        try env.setMaxDBs(DEFAULT_MAXDBS)
        try env.open(path: workingDirectoryPath)

        /// Get subreddit data
        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
            fatalError("Failed to find Guelph_submissions.zst in test bundle.")
        }
        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
            fatalError("Failed to load Guelph_submissions.zst from test bundle.")
        }

        let (submissions, _): ([Submission], [Data]) = try await loadFromRedditArchive(submissionsData)

        let transaction = try Transaction.begin(.write, in: env)

        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)

        var corpus = try DurableHNSWCorpus(
            encoder: _documentEncoder,
            namespace: "subreddit_durable",
            in: transaction
        )

        /// Add documents to corpus
        for submission in submissions {
            if let text = submission.selftext {
                try corpus.addUntokenizedDocument(text, in: transaction)
            }
        }

        /// Save dictionary to disk
        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")

        try transaction.commit()
    }
    func testSubredditQueryExample() async throws {
        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)

        /// Setting up the environment
        let env = try Environment()
        try env.setMapSize(ONE_GB)
        try env.setMaxReaders(DEFAULT_MAXREADERS)
        try env.setMaxDBs(DEFAULT_MAXDBS)
        try env.open(path: workingDirectoryPath)

        /// Reading the memory map (and dictionary) from disk
        let transaction = try Transaction.begin(.read, in: env)

        let corpus = try DurableHNSWCorpus(
            encoder: _documentEncoder,
            namespace: "subreddit_durable",
            in: transaction
        )

        corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")

        let query = "I love waterloo and I love the geese."
        let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }

        let results = try corpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction)
        for result in results {
            let key = Int(result.id.foreignKey)!
            print(corpus.getUntokenizedDocument(at: key))
        }
    }
}
#endif