From 7d54bb1693db10d28fe84369ffbf8a15791a6a55 Mon Sep 17 00:00:00 2001
From: Mingchung Xia <mingchung.xia@gmail.com>
Date: Wed, 13 Mar 2024 01:43:53 -0400
Subject: [PATCH] Durable subreddit persistence verified

---
 .../SwiftNLPTests/2. Encoding/HNSWTests.swift | 139 +++++++++++++++++-
 1 file changed, 137 insertions(+), 2 deletions(-)

diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
index eb7919a9..1c4f9f2d 100644
--- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift	
+++ b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift	
@@ -175,8 +175,8 @@ final class HNSWTests: XCTestCase {
 //        dataHandler.saveMemoryMap()
         
         //print("Loaded \(corpus.count) documents.")
-        let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit")
-        XCTAssert(corpus.count == 17999)
+//        let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit")
+//        XCTAssert(corpus.count == 17999)
     }
     
     func testTypicalNeighborhoodSizeExecutionTime() throws {
@@ -430,6 +430,141 @@ final class HNSWTests: XCTestCase {
             print("Error when trying corpus.encodedDocuments.find(): \(error)")
         }
     }
+    
+    /// Builds the durable (LMDB-backed) HNSW corpus for the bundled subreddit archive on disk.
+    /// Writes to `~/Downloads/lmdb` and saves the dictionary, but makes no assertions.
+    /// - Throws: Any error raised while preparing the directory, opening the environment,
+    ///   loading the archive, adding documents, or committing the transaction.
+    func testGenerateSubredditDurable() async throws {
+        /// Setting up working directory
+        let fileManager = FileManager.default
+        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+        let filepath = FilePath(directoryURL.path)
+        
+        /// Setting up the environment
+        let env = try Environment()
+        try env.setMapSize(1_073_741_824) /// 1 GB
+        try env.setMaxReaders(126) /// default
+        try env.setMaxDBs(10)
+        try env.open(path: filepath)
+        
+        /// Get subreddit data; a missing or unreadable resource fails this test
+        /// (with the underlying error) instead of crashing the whole test process.
+        let submissionsURL = try XCTUnwrap(
+            Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst"),
+            "Failed to find Guelph_submissions.zst in test bundle."
+        )
+        let submissionsData = try Data(contentsOf: submissionsURL)
+
+        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
+        
+        let transaction = try Transaction.begin(.write, in: env)
+        
+        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+        
+        let corpus = try DurableHNSWCorpus(
+            encoder: _documentEncoder,
+            namespace: "subreddit_durable",
+            in: transaction
+        )
+
+        /// Add documents to corpus; submissions without selftext are skipped
+        for text in submissions.compactMap(\.selftext) {
+            try corpus.addUntokenizedDocument(text, in: transaction)
+        }
+
+        /// Save dictionary to disk
+        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
+        
+        try transaction.commit()
+    }
+    
+    /// Sandbox/debugging query against the persisted "subreddit_durable" corpus; asserts nothing.
+    /// Presumably requires `testGenerateSubredditDurable` to have been run first — TODO confirm.
+    func testSubredditDurableQueryExample() async throws {
+        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+        
+        /// Setting up working directory
+        let fileManager = FileManager.default
+        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+        let filepath = FilePath(directoryURL.path)
+        
+        /// Setting up the environment
+        let env = try Environment()
+        try env.setMapSize(1_073_741_824) /// 1 GB
+        try env.setMaxReaders(126) /// default
+        try env.setMaxDBs(10)
+        try env.open(path: filepath)
+        
+        /// Reading the memory map (and dictionary) from disk. One transaction stays open for
+        /// the whole query so that `find` never runs on an already-committed transaction.
+        let readTransaction = try Transaction.begin(.write, in: env)
+        
+        let readCorpus = try DurableHNSWCorpus(
+            encoder: _documentEncoder,
+            namespace: "subreddit_durable",
+            in: readTransaction
+        )
+        
+        readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
+        
+        let query = "I love waterloo and I love the geese."
+        let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+        
+        let results = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: readTransaction)
+        
+        for neighbor in results {
+            let key = try XCTUnwrap(Int(neighbor.id.foreignKey))
+            print(readCorpus.getUntokenizedDocument(at: key))
+        }
+        /// End the transaction once the query results have been consumed
+        try readTransaction.commit()
+    }
+    
+    /// Sandbox/debugging query against the "testbasicqueryexampledurable" namespace; asserts nothing.
+    /// NOTE(review): presumably `dictionary.mmap` on disk must belong to this namespace — confirm.
+    func testAdjustable() async throws {
+        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+        
+        /// Setting up working directory
+        let fileManager = FileManager.default
+        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+        let filepath = FilePath(directoryURL.path)
+        
+        /// Setting up the environment
+        let env = try Environment()
+        try env.setMapSize(1_073_741_824) /// 1 GB
+        try env.setMaxReaders(126) /// default
+        try env.setMaxDBs(10)
+        try env.open(path: filepath)
+        
+        /// Reading the memory map (and dictionary) from disk. One transaction stays open for
+        /// the whole query so that `find` never runs on an already-committed transaction.
+        let readTransaction = try Transaction.begin(.write, in: env)
+        
+        let readCorpus = try DurableHNSWCorpus(
+            encoder: _documentEncoder,
+            namespace: "testbasicqueryexampledurable",
+            in: readTransaction
+        )
+        
+        readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
+        
+        let query = "I like to read about new technology and artificial intelligence"
+        let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+        
+        let results = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: readTransaction)
+        
+        for neighbor in results {
+            let key = try XCTUnwrap(Int(neighbor.id.foreignKey))
+            print(readCorpus.getUntokenizedDocument(at: key))
+        }
+        /// End the transaction once the query results have been consumed
+        try readTransaction.commit()
+    }
 }
 #endif
 
-- 
GitLab