Skip to content
Snippets Groups Projects
Commit 03f109b2 authored by Jim Wallace's avatar Jim Wallace
Browse files

Fixing linux compatibility

parent 095f0aa0
No related branches found
No related tags found
No related merge requests found
Pipeline #108404 passed with warnings
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
LastUpgradeVersion = "1500"
version = "1.7">
<BuildAction
parallelizeBuildables = "YES"
buildImplicitDependencies = "YES">
<BuildActionEntries>
<BuildActionEntry
buildForTesting = "YES"
buildForRunning = "YES"
buildForProfiling = "YES"
buildForArchiving = "YES"
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "SwiftNLP"
BuildableName = "SwiftNLP"
BlueprintName = "SwiftNLP"
ReferencedContainer = "container:">
</BuildableReference>
</BuildActionEntry>
<BuildActionEntry
buildForTesting = "YES"
buildForRunning = "YES"
buildForProfiling = "YES"
buildForArchiving = "YES"
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "Minimal"
BuildableName = "Minimal"
BlueprintName = "Minimal"
ReferencedContainer = "container:">
</BuildableReference>
</BuildActionEntry>
<BuildActionEntry
buildForTesting = "YES"
buildForRunning = "YES"
buildForProfiling = "NO"
buildForArchiving = "NO"
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "SwiftNLPTests"
BuildableName = "SwiftNLPTests"
BlueprintName = "SwiftNLPTests"
ReferencedContainer = "container:">
</BuildableReference>
</BuildActionEntry>
</BuildActionEntries>
</BuildAction>
<TestAction
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
shouldUseLaunchSchemeArgsEnv = "YES"
shouldAutocreateTestPlan = "YES">
<Testables>
<TestableReference
skipped = "NO">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "SwiftNLPTests"
BuildableName = "SwiftNLPTests"
BlueprintName = "SwiftNLPTests"
ReferencedContainer = "container:">
</BuildableReference>
</TestableReference>
</Testables>
</TestAction>
<LaunchAction
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0"
useCustomWorkingDirectory = "NO"
ignoresPersistentStateOnLaunch = "NO"
debugDocumentVersioning = "YES"
debugServiceExtension = "internal"
allowLocationSimulation = "YES">
<MacroExpansion>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "Minimal"
BuildableName = "Minimal"
BlueprintName = "Minimal"
ReferencedContainer = "container:">
</BuildableReference>
</MacroExpansion>
<!-- SECURITY(review): REDDIT_CLIENT_SECRET below is committed in plain text.
     Rotate this credential and load it from an untracked local configuration
     instead of checking it into version control. -->
<EnvironmentVariables>
<EnvironmentVariable
key = "REDDIT_CLIENT_SECRET"
value = "S4P1-g5CMxzTegTwVNn8BSq3_RvrVQ"
isEnabled = "YES">
</EnvironmentVariable>
<EnvironmentVariable
key = "REDDIT_CLIENT_ID"
value = "Z19OW-YwogDenXivptNW_Q"
isEnabled = "YES">
</EnvironmentVariable>
</EnvironmentVariables>
</LaunchAction>
<ProfileAction
buildConfiguration = "Release"
shouldUseLaunchSchemeArgsEnv = "YES"
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
debugDocumentVersioning = "YES">
<MacroExpansion>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "Minimal"
BuildableName = "Minimal"
BlueprintName = "Minimal"
ReferencedContainer = "container:">
</BuildableReference>
</MacroExpansion>
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">
</AnalyzeAction>
<ArchiveAction
buildConfiguration = "Release"
revealArchiveInOrganizer = "YES">
</ArchiveAction>
</Scheme>
......@@ -21,15 +21,8 @@
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#if os(Darwin)
#warning("OS = DARWIN")
#else
#warning("OS != DARWIN")
#endif
import Foundation
#if canImport(Accelerate)
#if os(Darwin)
import Surge
#else
//TODO: Implement Linux alternative with better performance, currently uses naive solution.
......@@ -63,7 +56,7 @@ class KeyedVectorCorpus: SNLPCorpus {
var encoding = defaultDocumentEncoding
for word in document {
if dictionary.contains(word) {
#if canImport(Surge)
#if os(Darwin)
encoding .+= dictionary[word] // Surge in-place
#else
for i in 0 ..< encoding.count {
......@@ -74,7 +67,7 @@ class KeyedVectorCorpus: SNLPCorpus {
}
}
#if canImport(Surge)
#if os(Darwin)
encoding /= Double(document.count)
#else
for i in 0 ..< encoding.count {
......
......@@ -69,22 +69,22 @@ class SIMDCorpus<DocumentEncoding: SIMD>: SNLPCorpus where DocumentEncoding.Scal
// }
}
/// Clusters the encoded documents into seven groups ("A"–"G") with k-means,
/// then prints the trained centroids followed by each point's assigned label.
func clusterWithKMeans() {
    let clusterLabels = ["A", "B", "C", "D", "E", "F", "G"]
    let model = KMeans<DocumentEncoding, String>(labels: clusterLabels)
    let points = Array(encodedDocuments.values)

    model.trainCenters(points, convergeDistance: 0.05)

    // Trained centers, one line per label.
    for (label, centroid) in zip(model.labels, model.centroids) {
        print("\(label): \(centroid)")
    }

    // Per-point cluster assignments.
    print("\nClassifications")
    for (label, point) in zip(model.fit(points), points) {
        print("\(label): \(point)")
    }
}
// func clusterWithKMeans() {
//
// let kmm = KMeans<DocumentEncoding, String>(labels: ["A", "B", "C", "D", "E", "F", "G"])
// let valuesArray = Array(encodedDocuments.values)
// kmm.trainCenters(valuesArray, convergeDistance: 0.05)
//
// for (label, centroid) in zip(kmm.labels, kmm.centroids) {
// print("\(label): \(centroid)")
// }
//
// print("\nClassifications")
// for (label, point) in zip(kmm.fit(valuesArray), valuesArray) {
// print("\(label): \(point)")
// }
//
// }
}
......
import Foundation
import simd
/// Lloyd's k-means clustering over SIMD vectors, with one user-supplied label
/// per cluster. Call `trainCenters` before `fit`.
class KMeans<Vector: SIMD, Label: Hashable> where Vector.Scalar: FloatingPoint {
    /// Number of clusters; always equals `labels.count`.
    let numCenters: Int
    /// One label per cluster, used to name the classifications returned by `fit`.
    let labels: [Label]
    /// Trained cluster centers; empty until `trainCenters` has completed.
    private(set) var centroids = [Vector]()

    init(labels: [Label]) {
        assert(labels.count > 1, "Exception: KMeans with less than 2 centers.")
        self.labels = labels
        self.numCenters = labels.count
    }

    /// Euclidean distance between two SIMD vectors.
    ///
    /// Generic over any SIMD width with a floating-point scalar so it also
    /// covers widths that have no specialized overload.
    @inlinable // TODO: Check if this actually improves performance?
    public func distance<V: SIMD>(_ x: V, _ y: V) -> V.Scalar where V.Scalar: FloatingPoint {
        let difference = x - y
        var distance = V.Scalar(0)
        for i in difference.indices {
            distance += difference[i] * difference[i]
        }
        return distance.squareRoot()
    }

    /// Index of the element of `centers` nearest to `x` (first wins on ties).
    private func indexOfNearestCenter(_ x: Vector, centers: [Vector]) -> Int {
        var nearestDist = Vector.Scalar.greatestFiniteMagnitude
        var minIndex = 0

        for (idx, center) in centers.enumerated() {
            let dist = distance(x, center)
            if dist < nearestDist {
                minIndex = idx
                nearestDist = dist
            }
        }
        return minIndex
    }

    /// Trains the model, iterating until the summed per-iteration centroid
    /// movement drops to `convergeDistance` or below.
    ///
    /// - Parameters:
    ///   - points: Training vectors; must contain at least `numCenters` elements.
    ///   - convergeDistance: Convergence threshold on the total center movement.
    func trainCenters(_ points: [Vector], convergeDistance: Vector.Scalar) {
        // Randomly take k objects from the input data to make the initial centroids.
        var centers = reservoirSample(points, k: numCenters)

        var centerMoveDist = Vector.Scalar(0)
        repeat {
            // This array keeps track of which data points belong to which centroids.
            var classification: [[Vector]] = .init(repeating: [], count: numCenters)

            // For each data point, find the centroid that it is closest to.
            for p in points {
                let classIndex = indexOfNearestCenter(p, centers: centers)
                classification[classIndex].append(p)
            }

            // Take the average of all the data points that belong to each centroid.
            // This moves the centroid to a new position.
            // BUG FIX (was the "NaN for first result" FIXME): an empty cluster
            // keeps its previous center — averaging zero points divides by zero
            // and poisons the centroid with NaN.
            let newCenters = classification.enumerated().map { idx, assignedPoints -> Vector in
                guard !assignedPoints.isEmpty else { return centers[idx] }
                return assignedPoints.reduce(Vector.zero, +) / Vector(repeating: Vector.Scalar(assignedPoints.count))
            }

            // Find out how far each centroid moved since the last iteration. If it's
            // only a small distance, then we're done.
            centerMoveDist = Vector.Scalar(0)
            for idx in 0..<numCenters {
                centerMoveDist += distance(centers[idx], newCenters[idx])
            }

            centers = newCenters
        } while centerMoveDist > convergeDistance

        centroids = centers
    }

    /// Label of the trained cluster nearest to `point`.
    /// Requires a prior successful call to `trainCenters`.
    func fit(_ point: Vector) -> Label {
        assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.")

        let centroidIndex = indexOfNearestCenter(point, centers: centroids)
        return labels[centroidIndex]
    }

    /// Labels for each vector in `points`; see `fit(_:)`.
    func fit(_ points: [Vector]) -> [Label] {
        assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.")

        return points.map(fit)
    }
}
// Pick k random elements from samples
func reservoirSample<T>(_ samples: [T], k: Int) -> [T] {
var result = [T]()
// Fill the result array with first k elements
for i in 0..<k {
result.append(samples[i])
}
// Randomly replace elements from remaining pool
for i in k..<samples.count {
let j = Int(arc4random_uniform(UInt32(i + 1)))
if j < k {
result[j] = samples[i]
}
}
return result
}
//import Foundation
//import simd
//
//class KMeans<Vector: SIMD, Label: Hashable> where Vector.Scalar: FloatingPoint {
// let numCenters: Int
// let labels: [Label]
// private(set) var centroids = [Vector]()
//
// init(labels: [Label]) {
// assert(labels.count > 1, "Exception: KMeans with less than 2 centers.")
// self.labels = labels
// self.numCenters = labels.count
// }
//
// //TODO: Confirm this works?
// @inlinable // TODO: Check if this actually improves performance?
// public func distance<V: SIMD>(_ x: V, _ y: V) -> V.Scalar where V.Scalar: FloatingPoint {
// let difference = x - y
// var distance = V.Scalar(0)
// for i in difference.indices {
// distance += difference[i] * difference[i]
// }
// return distance.squareRoot()
// }
//
// private func indexOfNearestCenter(_ x: Vector, centers: [Vector]) -> Int {
// var nearestDist = Vector.Scalar.greatestFiniteMagnitude
// var minIndex = 0
//
// for (idx, center) in centers.enumerated() {
// //let dist = x.distanceTo(center)
// let dist = distance(x,center)
// if dist < nearestDist {
// minIndex = idx
// nearestDist = dist
// }
// }
// return minIndex
// }
//
// func trainCenters(_ points: [Vector], convergeDistance: Vector.Scalar) {
//
// //points = points.filter{ !containsNaN( $0) }
//
// // Randomly take k objects from the input data to make the initial centroids.
// var centers = reservoirSample(points, k: numCenters)
//
// var centerMoveDist = Vector.Scalar(0)
// repeat {
// // This array keeps track of which data points belong to which centroids.
// var classification: [[Vector]] = .init(repeating: [], count: numCenters)
//
// // For each data point, find the centroid that it is closest to.
// for p in points {
// let classIndex = indexOfNearestCenter(p, centers: centers)
// classification[classIndex].append(p)
// }
//
// // Take the average of all the data points that belong to each centroid.
// // This moves the centroid to a new position.
// // FIXME: Whhy is this giving NaN for first result?
// let newCenters = classification.map { assignedPoints in
// assignedPoints.reduce(Vector.zero, +) / Vector(repeating: Vector.Scalar(assignedPoints.count))
// }
//
// // Find out how far each centroid moved since the last iteration. If it's
// // only a small distance, then we're done.
// centerMoveDist = Vector.Scalar(0)
// for idx in 0..<numCenters {
// centerMoveDist += distance(centers[idx], newCenters[idx]) //centers[idx].distanceTo(newCenters[idx])
// }
//
// centers = newCenters
// } while centerMoveDist > convergeDistance
//
// centroids = centers
// }
//
// func fit(_ point: Vector) -> Label {
// assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.")
//
// let centroidIndex = indexOfNearestCenter(point, centers: centroids)
// return labels[centroidIndex]
// }
//
// func fit(_ points: [Vector]) -> [Label] {
// assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.")
//
// return points.map(fit)
// }
//}
//
//// Pick k random elements from samples
//func reservoirSample<T>(_ samples: [T], k: Int) -> [T] {
// var result = [T]()
//
// // Fill the result array with first k elements
// for i in 0..<k {
// result.append(samples[i])
// }
//
// // Randomly replace elements from remaining pool
// for i in k..<samples.count {
// let j = Int(arc4random_uniform(UInt32(i + 1)))
// if j < k {
// result[j] = samples[i]
// }
// }
// return result
//}
import simd
extension SIMD where Self.Scalar: FloatingPoint {
    /// Euclidean distance between `x` and `y`.
    ///
    /// Default implementation for SIMD widths that have no more efficient
    /// specialized overload.
    @inlinable
    public func distance(_ x: Self, _ y: Self) -> Self.Scalar {
        distance_squared(x, y).squareRoot()
    }

    /// Squared Euclidean distance between `x` and `y` (no square root taken).
    @inlinable
    public func distance_squared(_ x: Self, _ y: Self) -> Self.Scalar {
        let delta = x - y
        return (delta * delta).sum()
    }
}
extension SIMD2<Double> {
    /// Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance(_ a: Self, _ b: Self) -> Double {
        simd.distance(a, b)
    }

    /// Squared Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance_squared(_ a: Self, _ b: Self) -> Double {
        simd.distance_squared(a, b)
    }
}
extension SIMD3<Double> {
    /// Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance(_ a: Self, _ b: Self) -> Double {
        simd.distance(a, b)
    }

    /// Squared Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance_squared(_ a: Self, _ b: Self) -> Double {
        simd.distance_squared(a, b)
    }
}
extension SIMD4<Double> {
    /// Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance(_ a: Self, _ b: Self) -> Double {
        simd.distance(a, b)
    }

    /// Squared Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance_squared(_ a: Self, _ b: Self) -> Double {
        simd.distance_squared(a, b)
    }
}
extension SIMD2<Float> {
    /// Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance(_ a: Self, _ b: Self) -> Float {
        simd.distance(a, b)
    }

    /// Squared Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance_squared(_ a: Self, _ b: Self) -> Float {
        simd.distance_squared(a, b)
    }
}
extension SIMD3<Float> {
    /// Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance(_ a: Self, _ b: Self) -> Float {
        simd.distance(a, b)
    }

    /// Squared Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance_squared(_ a: Self, _ b: Self) -> Float {
        simd.distance_squared(a, b)
    }
}
extension SIMD4<Float> {
    /// Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance(_ a: Self, _ b: Self) -> Float {
        simd.distance(a, b)
    }

    /// Squared Euclidean distance, deferring to the specialized simd routine.
    @inlinable
    public func distance_squared(_ a: Self, _ b: Self) -> Float {
        simd.distance_squared(a, b)
    }
}
//import simd
//
//extension SIMD where Self.Scalar: FloatingPoint {
//
// // A default Euclidian distance for SIMD when more efficient functions aren't provided
// @inlinable
// public func distance(_ x: Self, _ y: Self) -> Self.Scalar {
// let difference = x - y
// let squared = difference * difference
// return squared.sum().squareRoot()
// }
//
// @inlinable
// public func distance_squared(_ x: Self, _ y: Self) -> Self.Scalar {
// let difference = x - y
// let squared = difference * difference
// return squared.sum()
// }
//}
//
//extension SIMD2<Double> {
//
// @inlinable
// public func distance(_ x: Self, _ y: Self) -> Double {
// return simd.distance(x, y)
// }
//
// @inlinable
// public func distance_squared(_ x: Self, _ y: Self) -> Double {
// return simd.distance_squared(x, y)
// }
//
//}
//
//extension SIMD3<Double> {
//
// @inlinable
// public func distance(_ x: Self, _ y: Self) -> Double {
// return simd.distance(x, y)
// }
//
// @inlinable
// public func distance_squared(_ x: Self, _ y: Self) -> Double {
// return simd.distance_squared(x, y)
// }
//
//}
//
//extension SIMD4<Double> {
//
// @inlinable
// public func distance(_ x: Self, _ y: Self) -> Double {
// return simd.distance(x, y)
// }
//
// @inlinable
// public func distance_squared(_ x: Self, _ y: Self) -> Double {
// return simd.distance_squared(x, y)
// }
//
//
//}
//
//extension SIMD2<Float> {
//
// @inlinable
// public func distance(_ x: Self, _ y: Self) -> Float {
// return simd.distance(x, y)
// }
//
// @inlinable
// public func distance_squared(_ x: Self, _ y: Self) -> Float {
// return simd.distance_squared(x, y)
// }
//
//}
//
//extension SIMD3<Float> {
//
// @inlinable
// public func distance(_ x: Self, _ y: Self) -> Float {
// return simd.distance(x, y)
// }
//
// @inlinable
// public func distance_squared(_ x: Self, _ y: Self) -> Float {
// return simd.distance_squared(x, y)
// }
//
//}
//
//extension SIMD4<Float> {
//
// @inlinable
// public func distance(_ x: Self, _ y: Self) -> Float {
// return simd.distance(x, y)
// }
//
// @inlinable
// public func distance_squared(_ x: Self, _ y: Self) -> Float {
// return simd.distance_squared(x, y)
// }
//}
import simd
import KDTree
extension SIMD2: KDTreePoint where Scalar: BinaryFloatingPoint {
    /// Number of spatial dimensions; identical to the SIMD lane count (2).
    @inlinable
    public static var dimensions: Int { scalarCount }

    /// Coordinate along `dimension`, widened to Double as KDTree requires.
    @inlinable
    public func kdDimension(_ dimension: Int) -> Double {
        Double(self[dimension])
    }

    //TODO: Confirm that Swift is pulling the "most constrained overload" when available
    /// Squared Euclidean distance to `otherPoint`, widened to Double.
    @inlinable
    public func squaredDistance(to otherPoint: Self) -> Double {
        let delta = self - otherPoint
        return Double((delta * delta).sum())
    }
}
extension SIMD2<Double> {
    /// Specialized squared distance that defers to simd's optimized routine.
    @inlinable
    public func squaredDistance(to otherPoint: Self) -> Double {
        simd.distance_squared(self, otherPoint)
    }
}
extension SIMD2<Float> {
    /// Specialized squared distance via simd, widened to the Double that
    /// KDTreePoint's protocol requirement expects.
    @inlinable
    public func squaredDistance(to otherPoint: Self) -> Double {
        Double(simd.distance_squared(self, otherPoint))
    }
}
//import simd
//import KDTree
//
//extension SIMD2: KDTreePoint where Scalar: BinaryFloatingPoint {
//
// @inlinable
// public static var dimensions: Int { self.scalarCount }
//
// @inlinable
// public func kdDimension(_ dimension: Int) -> Double {
// return Double(self[dimension])
// }
//
// //TODO: Confirm that Swift is pulling the "most constrained overload" when available
// @inlinable
// public func squaredDistance(to otherPoint: Self) -> Double {
// let difference = self - otherPoint
// let squared = difference * difference
// return Double(squared.sum())
// }
//}
//
//extension SIMD2<Double> {
//
// @inlinable
// public func squaredDistance(to otherPoint: Self) -> Double {
// return simd.distance_squared(self, otherPoint)
// }
//
//}
//
//extension SIMD2<Float> {
//
// @inlinable
// public func squaredDistance(to otherPoint: Self) -> Double {
// return Double(simd.distance_squared(self, otherPoint))
// }
//
//}
......@@ -39,33 +39,33 @@ final class SwiftNLPLoadDataTests: XCTestCase {
// }
/// End-to-end smoke test: load the bundled Reddit JSON fixtures, encode them
/// into a GloVe-backed corpus, reduce dimensionality, and train the stub
/// topic model.
func testDocumentReading() throws {
    // loads all json data for test documents
    let redditCommentJson = TestUtils.loadAllRedditComment()
    let redditSubmissionJson = TestUtils.loadAllRedditSubmission()

    let redditComments = redditCommentJson.compactMap { TestUtils.readRedditCommentJson(json: $0) }
    let redditSubmissions = redditSubmissionJson.compactMap { TestUtils.readRedditSubmissionJson(json: $0) }

    // Extract body and selftext from each post, and store that for our corpus
    let bodies = redditComments.flatMap { $0.posts.compactMap { $0.body } } +
        redditSubmissions.flatMap { $0.posts.compactMap { $0.selftext } }

    // Add documents to corpus
    var corpus = KeyedVectorCorpus(source: .glove6B50d)
    corpus.addDocuments(documents: bodies)

    // XCTAssertEqual reports both actual and expected values on failure,
    // unlike XCTAssert(a == b), which only reports "false".
    XCTAssertEqual(corpus.encodedDocuments.count, 28765)

    // Dimensionality reduction
    let truncatedCorpus = SIMDCorpus<SIMD2<Double>>(corpus)
    //debugPrint("TruncatedCorpus with \(truncatedCorpus.encodedDocuments.count) documents created")

    // Clustering / Topic Detection
    var topics = StubTopicModel<[Double]>(numberOfTopics: 3)
    topics.train(truncatedCorpus)

    //debugPrint(topics)
}
// func testDocumentReading() throws {
// // loads all json data for test documents
// let redditCommentJson = TestUtils.loadAllRedditComment()
// let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
//
// let redditComments = redditCommentJson.compactMap { TestUtils.readRedditCommentJson(json: $0) }
// let redditSubmissions = redditSubmissionJson.compactMap { TestUtils.readRedditSubmissionJson(json: $0) }
//
// // Extract body and selftext from each post, and store that for our corpus
// let bodies = redditComments.flatMap { $0.posts.compactMap { $0.body } } +
// redditSubmissions.flatMap { $0.posts.compactMap { $0.selftext } }
//
// // Add documents to corpus
// var corpus = KeyedVectorCorpus(source: .glove6B50d)
// corpus.addDocuments(documents: bodies)
//
// //print(corpus.encodedDocuments.count)
// XCTAssert(corpus.encodedDocuments.count == 28765)
//
// // Dimensionality reduction
// let truncatedCorpus = SIMDCorpus<SIMD2<Double>>(corpus)
// //debugPrint("TruncatedCorpus with \(truncatedCorpus.encodedDocuments.count) documents created")
//
// // Clustering / Topic Detection
// var topics = StubTopicModel<[Double]>(numberOfTopics: 3)
// topics.train(truncatedCorpus)
//
// //debugPrint(topics)
// }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment