Skip to content
Snippets Groups Projects
Commit 7d54bb16 authored by Mingchung Xia's avatar Mingchung Xia
Browse files

Durable subreddit persistence verified

parent 58914b3a
No related branches found
No related tags found
1 merge request: !13 "HNSW Implementation with Testcases"
Pipeline #113977 failed
......@@ -175,8 +175,8 @@ final class HNSWTests: XCTestCase {
// dataHandler.saveMemoryMap()
//print("Loaded \(corpus.count) documents.")
let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit")
XCTAssert(corpus.count == 17999)
// let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit")
// XCTAssert(corpus.count == 17999)
}
func testTypicalNeighborhoodSizeExecutionTime() throws {
......@@ -430,6 +430,141 @@ final class HNSWTests: XCTestCase {
print("Error when trying corpus.encodedDocuments.find(): \(error)")
}
}
/// Generates the LMDB durable storage for the `Guelph_submissions` archive on disk.
///
/// This is a fixture generator rather than a behavioural test: it makes no assertions about
/// the resulting corpus. It fails only when the bundled archive is missing or when one of the
/// throwing setup, loading, or insertion calls reports an error.
///
/// - Throws: Any error raised while creating the working directory, configuring or opening the
///   environment, reading or decoding the archive, adding documents, or committing the transaction.
func testGenerateSubredditDurable() async throws {
    // Setting up working directory
    let fileManager = FileManager.default
    let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
    try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
    let filepath = FilePath(directoryURL.path)

    // Setting up the environment
    let env = try Environment()
    try env.setMapSize(1_073_741_824) // 1 GB
    try env.setMaxReaders(126) // default
    try env.setMaxDBs(10)
    try env.open(path: filepath)

    // Get subreddit data. A missing resource fails this test instead of crashing the whole
    // test process, and a read failure propagates its underlying error instead of being dropped.
    let resourceName = "Guelph_submissions"
    let submissionsURL = try XCTUnwrap(
        Bundle.module.url(forResource: resourceName, withExtension: "zst"),
        "Failed to find \(resourceName).zst in test bundle."
    )
    let submissionsData = try Data(contentsOf: submissionsURL)
    let (submissions, _): ([Submission], [Data]) = try await loadFromRedditArchive(submissionsData)

    // Every insertion below goes through this single write transaction.
    let transaction = try Transaction.begin(.write, in: env)
    let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
    var corpus = try DurableHNSWCorpus(
        encoder: _documentEncoder,
        namespace: "subreddit_durable",
        in: transaction
    )

    // Add documents to corpus; submissions without a self-text body are skipped.
    for submission in submissions {
        if let text = submission.selftext {
            try corpus.addUntokenizedDocument(text, in: transaction)
        }
    }

    // Save dictionary to disk, then commit the transaction.
    corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
    try transaction.commit()
}
/// Sandbox query against the `subreddit_durable` namespace; prints the nearest documents.
///
/// This is meant for manual debugging and is not an actual test case: it asserts nothing about
/// the returned neighbours. It expects the LMDB directory and `dictionary.mmap` to already
/// exist on disk.
///
/// - Throws: Any error raised while preparing the environment, opening the corpus, or running
///   the nearest-neighbour search; also when a result's foreign key is not an integer.
func testSubredditDurableQueryExample() async throws {
    let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)

    // Setting up working directory
    let fileManager = FileManager.default
    let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
    try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
    let filepath = FilePath(directoryURL.path)

    // Setting up the environment
    let env = try Environment()
    try env.setMapSize(1_073_741_824) // 1 GB
    try env.setMaxReaders(126) // default
    try env.setMaxDBs(10)
    try env.open(path: filepath)

    // Retained from the original: an empty write transaction that is committed immediately.
    // It is deliberately left unnamed so the finished transaction cannot be reused below.
    // NOTE(review): this looks like a no-op — confirm whether it can be removed.
    try Transaction.begin(.write, in: env).commit()

    // Reading the memory map (and dictionary) from disk
    let readTransaction = try Transaction.begin(.write, in: env)
    let readCorpus = try DurableHNSWCorpus(
        encoder: _documentEncoder,
        namespace: "subreddit_durable",
        in: readTransaction
    )
    readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")

    let query = "I love waterloo and I love the geese."
    let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }

    // Search inside the transaction the corpus was opened with. A committed transaction must
    // not be used for further operations.
    // NOTE(review): `readTransaction` is never explicitly finished, as in the original — confirm
    // whether it should be aborted or committed once the query is done.
    let neighbors = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: readTransaction)
    for neighbor in neighbors {
        // A non-numeric foreign key fails the test instead of trapping.
        let key = try XCTUnwrap(Int(neighbor.id.foreignKey))
        print(readCorpus.getUntokenizedDocument(at: key))
    }
}
/// Sandbox query against the `testbasicqueryexampledurable` namespace; prints the nearest documents.
///
/// This is meant for manual debugging and is not an actual test case: it asserts nothing about
/// the returned neighbours. It expects the LMDB directory and `dictionary.mmap` to already
/// exist on disk.
///
/// - Throws: Any error raised while preparing the environment, opening the corpus, or running
///   the nearest-neighbour search; also when a result's foreign key is not an integer.
func testAdjustable() async throws {
    let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)

    // Setting up working directory
    let fileManager = FileManager.default
    let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
    try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
    let filepath = FilePath(directoryURL.path)

    // Setting up the environment
    let env = try Environment()
    try env.setMapSize(1_073_741_824) // 1 GB
    try env.setMaxReaders(126) // default
    try env.setMaxDBs(10)
    try env.open(path: filepath)

    // Retained from the original: an empty write transaction that is committed immediately.
    // It is deliberately left unnamed so the finished transaction cannot be reused below.
    // NOTE(review): this looks like a no-op — confirm whether it can be removed.
    try Transaction.begin(.write, in: env).commit()

    // Reading the memory map (and dictionary) from disk
    let readTransaction = try Transaction.begin(.write, in: env)
    let readCorpus = try DurableHNSWCorpus(
        encoder: _documentEncoder,
        namespace: "testbasicqueryexampledurable",
        in: readTransaction
    )
    readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")

    let query = "I like to read about new technology and artificial intelligence"
    let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }

    // Search inside the transaction the corpus was opened with. A committed transaction must
    // not be used for further operations.
    // NOTE(review): `readTransaction` is never explicitly finished, as in the original — confirm
    // whether it should be aborted or committed once the query is done.
    let neighbors = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: readTransaction)
    for neighbor in neighbors {
        // A non-numeric foreign key fails the test instead of trapping.
        let key = try XCTUnwrap(Int(neighbor.id.foreignKey))
        print(readCorpus.getUntokenizedDocument(at: key))
    }
}
}
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment