diff --git a/.swiftpm/xcode/xcshareddata/xcschemes/SwiftNLP-Package.xcscheme b/.swiftpm/xcode/xcshareddata/xcschemes/SwiftNLP-Package.xcscheme deleted file mode 100644 index 91190196cbfd66acd172a4b1ba514d4cd45fbcc4..0000000000000000000000000000000000000000 --- a/.swiftpm/xcode/xcshareddata/xcschemes/SwiftNLP-Package.xcscheme +++ /dev/null @@ -1,127 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<Scheme - LastUpgradeVersion = "1500" - version = "1.7"> - <BuildAction - parallelizeBuildables = "YES" - buildImplicitDependencies = "YES"> - <BuildActionEntries> - <BuildActionEntry - buildForTesting = "YES" - buildForRunning = "YES" - buildForProfiling = "YES" - buildForArchiving = "YES" - buildForAnalyzing = "YES"> - <BuildableReference - BuildableIdentifier = "primary" - BlueprintIdentifier = "SwiftNLP" - BuildableName = "SwiftNLP" - BlueprintName = "SwiftNLP" - ReferencedContainer = "container:"> - </BuildableReference> - </BuildActionEntry> - <BuildActionEntry - buildForTesting = "YES" - buildForRunning = "YES" - buildForProfiling = "YES" - buildForArchiving = "YES" - buildForAnalyzing = "YES"> - <BuildableReference - BuildableIdentifier = "primary" - BlueprintIdentifier = "Minimal" - BuildableName = "Minimal" - BlueprintName = "Minimal" - ReferencedContainer = "container:"> - </BuildableReference> - </BuildActionEntry> - <BuildActionEntry - buildForTesting = "YES" - buildForRunning = "YES" - buildForProfiling = "NO" - buildForArchiving = "NO" - buildForAnalyzing = "YES"> - <BuildableReference - BuildableIdentifier = "primary" - BlueprintIdentifier = "SwiftNLPTests" - BuildableName = "SwiftNLPTests" - BlueprintName = "SwiftNLPTests" - ReferencedContainer = "container:"> - </BuildableReference> - </BuildActionEntry> - </BuildActionEntries> - </BuildAction> - <TestAction - buildConfiguration = "Debug" - selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB" - selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB" - shouldUseLaunchSchemeArgsEnv = "YES" - shouldAutocreateTestPlan = "YES"> - <Testables> - <TestableReference - skipped = "NO"> - <BuildableReference - BuildableIdentifier = "primary" - BlueprintIdentifier = "SwiftNLPTests" - BuildableName = "SwiftNLPTests" - BlueprintName = "SwiftNLPTests" - ReferencedContainer = "container:"> - </BuildableReference> - </TestableReference> - </Testables> - </TestAction> - <LaunchAction - buildConfiguration = "Debug" - selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB" - selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB" - launchStyle = "0" - useCustomWorkingDirectory = "NO" - ignoresPersistentStateOnLaunch = "NO" - debugDocumentVersioning = "YES" - debugServiceExtension = "internal" - allowLocationSimulation = "YES"> - <MacroExpansion> - <BuildableReference - BuildableIdentifier = "primary" - BlueprintIdentifier = "Minimal" - BuildableName = "Minimal" - BlueprintName = "Minimal" - ReferencedContainer = "container:"> - </BuildableReference> - </MacroExpansion> - <EnvironmentVariables> - <EnvironmentVariable - key = "REDDIT_CLIENT_SECRET" - value = "S4P1-g5CMxzTegTwVNn8BSq3_RvrVQ" - isEnabled = "YES"> - </EnvironmentVariable> - <EnvironmentVariable - key = "REDDIT_CLIENT_ID" - value = "Z19OW-YwogDenXivptNW_Q" - isEnabled = "YES"> - </EnvironmentVariable> - </EnvironmentVariables> - </LaunchAction> - <ProfileAction - buildConfiguration = "Release" - shouldUseLaunchSchemeArgsEnv = "YES" - savedToolIdentifier = "" - useCustomWorkingDirectory = "NO" - debugDocumentVersioning = "YES"> - <MacroExpansion> - <BuildableReference - BuildableIdentifier = "primary" - BlueprintIdentifier = "Minimal" - BuildableName = "Minimal" - BlueprintName = "Minimal" - ReferencedContainer = "container:"> - </BuildableReference> - </MacroExpansion> - </ProfileAction> - <AnalyzeAction - buildConfiguration = "Debug"> - </AnalyzeAction> - <ArchiveAction - buildConfiguration = "Release" - revealArchiveInOrganizer = "YES"> - </ArchiveAction> -</Scheme> diff --git a/Sources/SwiftNLP/2. Embeddings/KeyedVectorCorpus.swift b/Sources/SwiftNLP/2. Embeddings/KeyedVectorCorpus.swift index f5f9dcb3c5c51356b5f9833bca3b931b2c557aff..d06266f343ec561e76ff880f5df78dbd63d4f5ae 100644 --- a/Sources/SwiftNLP/2. Embeddings/KeyedVectorCorpus.swift +++ b/Sources/SwiftNLP/2. Embeddings/KeyedVectorCorpus.swift @@ -21,13 +21,6 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. -#if os(Darwin) -#warning("OS = DARWIN") -#else -#warning("OS != DARWIN") -#endif - - import Foundation #if canImport(Accelerate) import Surge @@ -63,7 +56,7 @@ class KeyedVectorCorpus: SNLPCorpus { var encoding = defaultDocumentEncoding for word in document { if dictionary.contains(word) { - #if canImport(Surge) + #if canImport(Accelerate) encoding .+= dictionary[word] // Surge in-place #else for i in 0 ..< encoding.count { @@ -74,7 +67,7 @@ class KeyedVectorCorpus: SNLPCorpus { } } - #if canImport(Surge) + #if canImport(Accelerate) encoding /= Double(document.count) #else for i in 0 ..< encoding.count { diff --git a/Sources/SwiftNLP/2. Embeddings/KeyedVectorDictionary.swift b/Sources/SwiftNLP/2. Embeddings/KeyedVectorDictionary.swift index fcd03d8635f11b7a2c0b943ec57d767af5d38bba..5bf8210863ece2188e7af38869950f806db41452 100644 --- a/Sources/SwiftNLP/2. Embeddings/KeyedVectorDictionary.swift +++ b/Sources/SwiftNLP/2. Embeddings/KeyedVectorDictionary.swift @@ -23,7 +23,7 @@ import Foundation -#if os(Darwin) +#if canImport(Accelerate) import SwiftAnnoy #else #warning("Need an alternative to SwiftAnnoy for Linux") diff --git a/Sources/SwiftNLP/2. Embeddings/SIMDCorpus.swift b/Sources/SwiftNLP/2. Embeddings/SIMDCorpus.swift index 3fc9d5c183814bdc4ab857f6cd94f439e4f34517..771871df4196fce59470bed4495a0936c0ed5e6d 100644 --- a/Sources/SwiftNLP/2. Embeddings/SIMDCorpus.swift +++ b/Sources/SwiftNLP/2. Embeddings/SIMDCorpus.swift @@ -69,22 +69,22 @@ class SIMDCorpus<DocumentEncoding: SIMD>: SNLPCorpus where DocumentEncoding.Scal // } } - func clusterWithKMeans() { - - let kmm = KMeans<DocumentEncoding, String>(labels: ["A", "B", "C", "D", "E", "F", "G"]) - let valuesArray = Array(encodedDocuments.values) - kmm.trainCenters(valuesArray, convergeDistance: 0.05) - - for (label, centroid) in zip(kmm.labels, kmm.centroids) { - print("\(label): \(centroid)") - } - - print("\nClassifications") - for (label, point) in zip(kmm.fit(valuesArray), valuesArray) { - print("\(label): \(point)") - } - - } +// func clusterWithKMeans() { +// +// let kmm = KMeans<DocumentEncoding, String>(labels: ["A", "B", "C", "D", "E", "F", "G"]) +// let valuesArray = Array(encodedDocuments.values) +// kmm.trainCenters(valuesArray, convergeDistance: 0.05) +// +// for (label, centroid) in zip(kmm.labels, kmm.centroids) { +// print("\(label): \(centroid)") +// } +// +// print("\nClassifications") +// for (label, point) in zip(kmm.fit(valuesArray), valuesArray) { +// print("\(label): \(point)") +// } +// +// } } diff --git a/Sources/SwiftNLP/4. Clustering/K-Means.swift b/Sources/SwiftNLP/4. Clustering/K-Means.swift index 087ad7676dd52b147b15f39a6da2c76bf8289b93..8180cd970a03bdce865689e6bcd35b8ab944bca7 100644 --- a/Sources/SwiftNLP/4. Clustering/K-Means.swift +++ b/Sources/SwiftNLP/4. Clustering/K-Means.swift @@ -1,110 +1,110 @@ -import Foundation -import simd - -class KMeans<Vector: SIMD, Label: Hashable> where Vector.Scalar: FloatingPoint { - let numCenters: Int - let labels: [Label] - private(set) var centroids = [Vector]() - - init(labels: [Label]) { - assert(labels.count > 1, "Exception: KMeans with less than 2 centers.") - self.labels = labels - self.numCenters = labels.count - } - - //TODO: Confirm this works? - @inlinable // TODO: Check if this actually improves performance? - public func distance<V: SIMD>(_ x: V, _ y: V) -> V.Scalar where V.Scalar: FloatingPoint { - let difference = x - y - var distance = V.Scalar(0) - for i in difference.indices { - distance += difference[i] * difference[i] - } - return distance.squareRoot() - } - - private func indexOfNearestCenter(_ x: Vector, centers: [Vector]) -> Int { - var nearestDist = Vector.Scalar.greatestFiniteMagnitude - var minIndex = 0 - - for (idx, center) in centers.enumerated() { - //let dist = x.distanceTo(center) - let dist = distance(x,center) - if dist < nearestDist { - minIndex = idx - nearestDist = dist - } - } - return minIndex - } - - func trainCenters(_ points: [Vector], convergeDistance: Vector.Scalar) { - - //points = points.filter{ !containsNaN( $0) } - - // Randomly take k objects from the input data to make the initial centroids. - var centers = reservoirSample(points, k: numCenters) - - var centerMoveDist = Vector.Scalar(0) - repeat { - // This array keeps track of which data points belong to which centroids. - var classification: [[Vector]] = .init(repeating: [], count: numCenters) - - // For each data point, find the centroid that it is closest to. - for p in points { - let classIndex = indexOfNearestCenter(p, centers: centers) - classification[classIndex].append(p) - } - - // Take the average of all the data points that belong to each centroid. - // This moves the centroid to a new position. - // FIXME: Whhy is this giving NaN for first result? - let newCenters = classification.map { assignedPoints in - assignedPoints.reduce(Vector.zero, +) / Vector(repeating: Vector.Scalar(assignedPoints.count)) - } - - // Find out how far each centroid moved since the last iteration. If it's - // only a small distance, then we're done. - centerMoveDist = Vector.Scalar(0) - for idx in 0..<numCenters { - centerMoveDist += distance(centers[idx], newCenters[idx]) //centers[idx].distanceTo(newCenters[idx]) - } - - centers = newCenters - } while centerMoveDist > convergeDistance - - centroids = centers - } - - func fit(_ point: Vector) -> Label { - assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.") - - let centroidIndex = indexOfNearestCenter(point, centers: centroids) - return labels[centroidIndex] - } - - func fit(_ points: [Vector]) -> [Label] { - assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.") - - return points.map(fit) - } -} - -// Pick k random elements from samples -func reservoirSample<T>(_ samples: [T], k: Int) -> [T] { - var result = [T]() - - // Fill the result array with first k elements - for i in 0..<k { - result.append(samples[i]) - } - - // Randomly replace elements from remaining pool - for i in k..<samples.count { - let j = Int(arc4random_uniform(UInt32(i + 1))) - if j < k { - result[j] = samples[i] - } - } - return result -} +//import Foundation +//import simd +// +//class KMeans<Vector: SIMD, Label: Hashable> where Vector.Scalar: FloatingPoint { +// let numCenters: Int +// let labels: [Label] +// private(set) var centroids = [Vector]() +// +// init(labels: [Label]) { +// assert(labels.count > 1, "Exception: KMeans with less than 2 centers.") +// self.labels = labels +// self.numCenters = labels.count +// } +// +// //TODO: Confirm this works? +// @inlinable // TODO: Check if this actually improves performance? +// public func distance<V: SIMD>(_ x: V, _ y: V) -> V.Scalar where V.Scalar: FloatingPoint { +// let difference = x - y +// var distance = V.Scalar(0) +// for i in difference.indices { +// distance += difference[i] * difference[i] +// } +// return distance.squareRoot() +// } +// +// private func indexOfNearestCenter(_ x: Vector, centers: [Vector]) -> Int { +// var nearestDist = Vector.Scalar.greatestFiniteMagnitude +// var minIndex = 0 +// +// for (idx, center) in centers.enumerated() { +// //let dist = x.distanceTo(center) +// let dist = distance(x,center) +// if dist < nearestDist { +// minIndex = idx +// nearestDist = dist +// } +// } +// return minIndex +// } +// +// func trainCenters(_ points: [Vector], convergeDistance: Vector.Scalar) { +// +// //points = points.filter{ !containsNaN( $0) } +// +// // Randomly take k objects from the input data to make the initial centroids. +// var centers = reservoirSample(points, k: numCenters) +// +// var centerMoveDist = Vector.Scalar(0) +// repeat { +// // This array keeps track of which data points belong to which centroids. +// var classification: [[Vector]] = .init(repeating: [], count: numCenters) +// +// // For each data point, find the centroid that it is closest to. +// for p in points { +// let classIndex = indexOfNearestCenter(p, centers: centers) +// classification[classIndex].append(p) +// } +// +// // Take the average of all the data points that belong to each centroid. +// // This moves the centroid to a new position. +// // FIXME: Whhy is this giving NaN for first result? +// let newCenters = classification.map { assignedPoints in +// assignedPoints.reduce(Vector.zero, +) / Vector(repeating: Vector.Scalar(assignedPoints.count)) +// } +// +// // Find out how far each centroid moved since the last iteration. If it's +// // only a small distance, then we're done. +// centerMoveDist = Vector.Scalar(0) +// for idx in 0..<numCenters { +// centerMoveDist += distance(centers[idx], newCenters[idx]) //centers[idx].distanceTo(newCenters[idx]) +// } +// +// centers = newCenters +// } while centerMoveDist > convergeDistance +// +// centroids = centers +// } +// +// func fit(_ point: Vector) -> Label { +// assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.") +// +// let centroidIndex = indexOfNearestCenter(point, centers: centroids) +// return labels[centroidIndex] +// } +// +// func fit(_ points: [Vector]) -> [Label] { +// assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.") +// +// return points.map(fit) +// } +//} +// +//// Pick k random elements from samples +//func reservoirSample<T>(_ samples: [T], k: Int) -> [T] { +// var result = [T]() +// +// // Fill the result array with first k elements +// for i in 0..<k { +// result.append(samples[i]) +// } +// +// // Randomly replace elements from remaining pool +// for i in k..<samples.count { +// let j = Int(arc4random_uniform(UInt32(i + 1))) +// if j < k { +// result[j] = samples[i] +// } +// } +// return result +//} diff --git a/Sources/SwiftNLP/4. Clustering/SIMD + distance.swift b/Sources/SwiftNLP/4. Clustering/SIMD + distance.swift index a278b9eec7621bd841ca2732e694d19c14b8db8f..5137846388fbb44ca2026f0904db145a9fea6d27 100644 --- a/Sources/SwiftNLP/4. Clustering/SIMD + distance.swift +++ b/Sources/SwiftNLP/4. Clustering/SIMD + distance.swift @@ -1,103 +1,103 @@ -import simd - -extension SIMD where Self.Scalar: FloatingPoint { - - // A default Euclidian distance for SIMD when more efficient functions aren't provided - @inlinable - public func distance(_ x: Self, _ y: Self) -> Self.Scalar { - let difference = x - y - let squared = difference * difference - return squared.sum().squareRoot() - } - - @inlinable - public func distance_squared(_ x: Self, _ y: Self) -> Self.Scalar { - let difference = x - y - let squared = difference * difference - return squared.sum() - } -} - -extension SIMD2<Double> { - - @inlinable - public func distance(_ x: Self, _ y: Self) -> Double { - return simd.distance(x, y) - } - - @inlinable - public func distance_squared(_ x: Self, _ y: Self) -> Double { - return simd.distance_squared(x, y) - } - -} - -extension SIMD3<Double> { - - @inlinable - public func distance(_ x: Self, _ y: Self) -> Double { - return simd.distance(x, y) - } - - @inlinable - public func distance_squared(_ x: Self, _ y: Self) -> Double { - return simd.distance_squared(x, y) - } - -} - -extension SIMD4<Double> { - - @inlinable - public func distance(_ x: Self, _ y: Self) -> Double { - return simd.distance(x, y) - } - - @inlinable - public func distance_squared(_ x: Self, _ y: Self) -> Double { - return simd.distance_squared(x, y) - } - - -} - -extension SIMD2<Float> { - - @inlinable - public func distance(_ x: Self, _ y: Self) -> Float { - return simd.distance(x, y) - } - - @inlinable - public func distance_squared(_ x: Self, _ y: Self) -> Float { - return simd.distance_squared(x, y) - } - -} - -extension SIMD3<Float> { - - @inlinable - public func distance(_ x: Self, _ y: Self) -> Float { - return simd.distance(x, y) - } - - @inlinable - public func distance_squared(_ x: Self, _ y: Self) -> Float { - return simd.distance_squared(x, y) - } - -} - -extension SIMD4<Float> { - - @inlinable - public func distance(_ x: Self, _ y: Self) -> Float { - return simd.distance(x, y) - } - - @inlinable - public func distance_squared(_ x: Self, _ y: Self) -> Float { - return simd.distance_squared(x, y) - } -} +//import simd +// +//extension SIMD where Self.Scalar: FloatingPoint { +// +// // A default Euclidian distance for SIMD when more efficient functions aren't provided +// @inlinable +// public func distance(_ x: Self, _ y: Self) -> Self.Scalar { +// let difference = x - y +// let squared = difference * difference +// return squared.sum().squareRoot() +// } +// +// @inlinable +// public func distance_squared(_ x: Self, _ y: Self) -> Self.Scalar { +// let difference = x - y +// let squared = difference * difference +// return squared.sum() +// } +//} +// +//extension SIMD2<Double> { +// +// @inlinable +// public func distance(_ x: Self, _ y: Self) -> Double { +// return simd.distance(x, y) +// } +// +// @inlinable +// public func distance_squared(_ x: Self, _ y: Self) -> Double { +// return simd.distance_squared(x, y) +// } +// +//} +// +//extension SIMD3<Double> { +// +// @inlinable +// public func distance(_ x: Self, _ y: Self) -> Double { +// return simd.distance(x, y) +// } +// +// @inlinable +// public func distance_squared(_ x: Self, _ y: Self) -> Double { +// return simd.distance_squared(x, y) +// } +// +//} +// +//extension SIMD4<Double> { +// +// @inlinable +// public func distance(_ x: Self, _ y: Self) -> Double { +// return simd.distance(x, y) +// } +// +// @inlinable +// public func distance_squared(_ x: Self, _ y: Self) -> Double { +// return simd.distance_squared(x, y) +// } +// +// +//} +// +//extension SIMD2<Float> { +// +// @inlinable +// public func distance(_ x: Self, _ y: Self) -> Float { +// return simd.distance(x, y) +// } +// +// @inlinable +// public func distance_squared(_ x: Self, _ y: Self) -> Float { +// return simd.distance_squared(x, y) +// } +// +//} +// +//extension SIMD3<Float> { +// +// @inlinable +// public func distance(_ x: Self, _ y: Self) -> Float { +// return simd.distance(x, y) +// } +// +// @inlinable +// public func distance_squared(_ x: Self, _ y: Self) -> Float { +// return simd.distance_squared(x, y) +// } +// +//} +// +//extension SIMD4<Float> { +// +// @inlinable +// public func distance(_ x: Self, _ y: Self) -> Float { +// return simd.distance(x, y) +// } +// +// @inlinable +// public func distance_squared(_ x: Self, _ y: Self) -> Float { +// return simd.distance_squared(x, y) +// } +//} diff --git a/Sources/SwiftNLP/4. Clustering/SIMD2 + KDTreePoint.swift b/Sources/SwiftNLP/4. Clustering/SIMD2 + KDTreePoint.swift index 2593f742cd53195d5d62606634893da6ab9f9563..a4d4d6c5d1b2e3b9f8352192753571a05db7e94a 100644 --- a/Sources/SwiftNLP/4. Clustering/SIMD2 + KDTreePoint.swift +++ b/Sources/SwiftNLP/4. Clustering/SIMD2 + KDTreePoint.swift @@ -1,39 +1,39 @@ -import simd -import KDTree - -extension SIMD2: KDTreePoint where Scalar: BinaryFloatingPoint { - - @inlinable - public static var dimensions: Int { self.scalarCount } - - @inlinable - public func kdDimension(_ dimension: Int) -> Double { - return Double(self[dimension]) - } - - //TODO: Confirm that Swift is pulling the "most constrained overload" when available - @inlinable - public func squaredDistance(to otherPoint: Self) -> Double { - let difference = self - otherPoint - let squared = difference * difference - return Double(squared.sum()) - } -} - -extension SIMD2<Double> { - - @inlinable - public func squaredDistance(to otherPoint: Self) -> Double { - return simd.distance_squared(self, otherPoint) - } - -} - -extension SIMD2<Float> { - - @inlinable - public func squaredDistance(to otherPoint: Self) -> Double { - return Double(simd.distance_squared(self, otherPoint)) - } - -} +//import simd +//import KDTree +// +//extension SIMD2: KDTreePoint where Scalar: BinaryFloatingPoint { +// +// @inlinable +// public static var dimensions: Int { self.scalarCount } +// +// @inlinable +// public func kdDimension(_ dimension: Int) -> Double { +// return Double(self[dimension]) +// } +// +// //TODO: Confirm that Swift is pulling the "most constrained overload" when available +// @inlinable +// public func squaredDistance(to otherPoint: Self) -> Double { +// let difference = self - otherPoint +// let squared = difference * difference +// return Double(squared.sum()) +// } +//} +// +//extension SIMD2<Double> { +// +// @inlinable +// public func squaredDistance(to otherPoint: Self) -> Double { +// return simd.distance_squared(self, otherPoint) +// } +// +//} +// +//extension SIMD2<Float> { +// +// @inlinable +// public func squaredDistance(to otherPoint: Self) -> Double { +// return Double(simd.distance_squared(self, otherPoint)) +// } +// +//} diff --git a/Tests/SwiftNLPTests/SNLPLoadDataTests.swift b/Tests/SwiftNLPTests/SNLPLoadDataTests.swift index 9e39c1c5b6be02d99c36f3cde415d258fcd3f006..e86742fc502a89ea4648771c2c52857d62473d7e 100644 --- a/Tests/SwiftNLPTests/SNLPLoadDataTests.swift +++ b/Tests/SwiftNLPTests/SNLPLoadDataTests.swift @@ -39,33 +39,33 @@ final class SwiftNLPLoadDataTests: XCTestCase { // } - func testDocumentReading() throws { - // loads all json data for test documents - let redditCommentJson = TestUtils.loadAllRedditComment() - let redditSubmissionJson = TestUtils.loadAllRedditSubmission() - - let redditComments = redditCommentJson.compactMap { TestUtils.readRedditCommentJson(json: $0) } - let redditSubmissions = redditSubmissionJson.compactMap { TestUtils.readRedditSubmissionJson(json: $0) } - - // Extract body and selftext from each post, and store that for our corpus - let bodies = redditComments.flatMap { $0.posts.compactMap { $0.body } } + - redditSubmissions.flatMap { $0.posts.compactMap { $0.selftext } } - - // Add documents to corpus - var corpus = KeyedVectorCorpus(source: .glove6B50d) - corpus.addDocuments(documents: bodies) - - //print(corpus.encodedDocuments.count) - XCTAssert(corpus.encodedDocuments.count == 28765) - - // Dimensionality reduction - let truncatedCorpus = SIMDCorpus<SIMD2<Double>>(corpus) - //debugPrint("TruncatedCorpus with \(truncatedCorpus.encodedDocuments.count) documents created") - - // Clustering / Topic Detection - var topics = StubTopicModel<[Double]>(numberOfTopics: 3) - topics.train(truncatedCorpus) - - //debugPrint(topics) - } +// func testDocumentReading() throws { +// // loads all json data for test documents +// let redditCommentJson = TestUtils.loadAllRedditComment() +// let redditSubmissionJson = TestUtils.loadAllRedditSubmission() +// +// let redditComments = redditCommentJson.compactMap { TestUtils.readRedditCommentJson(json: $0) } +// let redditSubmissions = redditSubmissionJson.compactMap { TestUtils.readRedditSubmissionJson(json: $0) } +// +// // Extract body and selftext from each post, and store that for our corpus +// let bodies = redditComments.flatMap { $0.posts.compactMap { $0.body } } + +// redditSubmissions.flatMap { $0.posts.compactMap { $0.selftext } } +// +// // Add documents to corpus +// var corpus = KeyedVectorCorpus(source: .glove6B50d) +// corpus.addDocuments(documents: bodies) +// +// //print(corpus.encodedDocuments.count) +// XCTAssert(corpus.encodedDocuments.count == 28765) +// +// // Dimensionality reduction +// let truncatedCorpus = SIMDCorpus<SIMD2<Double>>(corpus) +// //debugPrint("TruncatedCorpus with \(truncatedCorpus.encodedDocuments.count) documents created") +// +// // Clustering / Topic Detection +// var topics = StubTopicModel<[Double]>(numberOfTopics: 3) +// topics.train(truncatedCorpus) +// +// //debugPrint(topics) +// } }