From 7d54bb1693db10d28fe84369ffbf8a15791a6a55 Mon Sep 17 00:00:00 2001
From: Mingchung Xia <mingchung.xia@gmail.com>
Date: Wed, 13 Mar 2024 01:43:53 -0400
Subject: [PATCH] Durable subreddit persistence verified

---
Review amendments to the added lines (hunk sizes and diffstat are unchanged):
 * nearest-neighbour queries run in readTransaction, not the already-committed `transaction`
 * fatalError messages name Guelph_submissions.zst, the resource that is actually loaded
 * foreign keys are parsed with XCTUnwrap instead of a force-unwrap

 .../SwiftNLPTests/2. Encoding/HNSWTests.swift | 139 +++++++++++++++++-
 1 file changed, 137 insertions(+), 2 deletions(-)

diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
index eb7919a9..1c4f9f2d 100644
--- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
+++ b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
@@ -175,8 +175,8 @@ final class HNSWTests: XCTestCase {
 //        dataHandler.saveMemoryMap()
         //print("Loaded \(corpus.count) documents.")
 
-        let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit")
-        XCTAssert(corpus.count == 17999)
+//        let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit")
+//        XCTAssert(corpus.count == 17999)
     }
 
     func testTypicalNeighborhoodSizeExecutionTime() throws {
@@ -430,6 +430,141 @@ final class HNSWTests: XCTestCase {
             print("Error when trying corpus.encodedDocuments.find(): \(error)")
         }
     }
+
+    func testGenerateSubredditDurable() async throws {
+        /// Generates the LMDB durable storage on disk; this method makes no assertions of its own
+
+        /// Setting up working directory
+        let fileManager = FileManager.default
+        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+        let filepath = FilePath(directoryURL.path)
+
+        /// Setting up the environment
+        let env = try Environment()
+        try env.setMapSize(1_073_741_824) /// 1 GB
+        try env.setMaxReaders(126) /// default
+        try env.setMaxDBs(10)
+        try env.open(path: filepath)
+
+        /// Get subreddit data
+        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
+            fatalError("Failed to find Guelph_submissions.zst in test bundle.")
+        }
+        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
+            fatalError("Failed to load Guelph_submissions.zst from test bundle.")
+        }
+
+        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
+
+        let transaction = try Transaction.begin(.write, in: env)
+
+        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+
+        var corpus = try DurableHNSWCorpus(
+            encoder: _documentEncoder,
+            namespace: "subreddit_durable",
+            in: transaction
+        )
+
+        /// Add documents to corpus (submissions without selftext are skipped)
+        for submission in submissions {
+            if let text = submission.selftext {
+                try corpus.addUntokenizedDocument(text, in: transaction)
+            }
+        }
+
+        /// Save dictionary to disk
+        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
+
+        try transaction.commit()
+    }
+
+    func testSubredditDurableQueryExample() async throws {
+        /// This test case is only for sandbox testing and debugging; it is not an actual test case
+
+        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+
+        /// Setting up working directory
+        let fileManager = FileManager.default
+        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+        let filepath = FilePath(directoryURL.path)
+
+        /// Setting up the environment
+        let env = try Environment()
+        try env.setMapSize(1_073_741_824) /// 1 GB
+        try env.setMaxReaders(126) /// default
+        try env.setMaxDBs(10)
+        try env.open(path: filepath)
+
+        let transaction = try Transaction.begin(.write, in: env)
+        try transaction.commit()
+
+        /// Reading the memory map (and dictionary) from disk
+        let readTransaction = try Transaction.begin(.write, in: env)
+
+        let readCorpus = try DurableHNSWCorpus(
+            encoder: _documentEncoder,
+            namespace: "subreddit_durable",
+            in: readTransaction
+        )
+
+        readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
+
+        let query = "I love waterloo and I love the geese."
+        let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+        /// Search inside the still-open readTransaction; `transaction` above has already been committed
+        let results = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: readTransaction)
+
+        for neighbor in results {
+            let key = try XCTUnwrap(Int(neighbor.id.foreignKey))
+            print(readCorpus.getUntokenizedDocument(at: key))
+        }
+    }
+
+    func testAdjustable() async throws {
+        /// This test case is only for sandbox testing and debugging; it is not an actual test case
+
+        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+
+        /// Setting up working directory
+        let fileManager = FileManager.default
+        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+        let filepath = FilePath(directoryURL.path)
+
+        /// Setting up the environment
+        let env = try Environment()
+        try env.setMapSize(1_073_741_824) /// 1 GB
+        try env.setMaxReaders(126) /// default
+        try env.setMaxDBs(10)
+        try env.open(path: filepath)
+
+        let transaction = try Transaction.begin(.write, in: env)
+        try transaction.commit()
+
+        /// Reading the memory map (and dictionary) from disk
+        let readTransaction = try Transaction.begin(.write, in: env)
+
+        let readCorpus = try DurableHNSWCorpus(
+            encoder: _documentEncoder,
+            namespace: "testbasicqueryexampledurable",
+            in: readTransaction
+        )
+
+        readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
+
+        let query = "I like to read about new technology and artificial intelligence"
+        let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+        /// Search inside the still-open readTransaction; `transaction` above has already been committed
+        let results = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: readTransaction)
+
+        for neighbor in results {
+            let key = try XCTUnwrap(Int(neighbor.id.foreignKey))
+            print(readCorpus.getUntokenizedDocument(at: key))
+        }
+    }
 }
 
 #endif
-- 
GitLab