Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
Curio
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Jim Wallace
Curio
Commits
2d561b2a
Commit
2d561b2a
authored
2 years ago
by
Jim Wallace
Browse files
Options
Downloads
Patches
Plain Diff
Stores EncodedDocument in Corpus. TODO: Need to clean up types across definitions/corpus/dictionary
parent
3f908e3b
No related branches found
No related tags found
No related merge requests found
Pipeline
#89053
passed
2 years ago
Stage: build
Stage: test
Changes
3
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
Sources/SwiftNLP/SNLPCorpus.swift
+5
-1
5 additions, 1 deletion
Sources/SwiftNLP/SNLPCorpus.swift
Sources/SwiftNLP/SNLPDefinitions.swift
+3
-1
3 additions, 1 deletion
Sources/SwiftNLP/SNLPDefinitions.swift
Sources/SwiftNLP/SNLPDictionary.swift
+18
-22
18 additions, 22 deletions
Sources/SwiftNLP/SNLPDictionary.swift
with
26 additions
and
24 deletions
Sources/SwiftNLP/SNLPCorpus.swift
+
5
−
1
View file @
2d561b2a
...
@@ -12,6 +12,8 @@ import NaturalLanguage
...
@@ -12,6 +12,8 @@ import NaturalLanguage
class
SNLPCorpus
{
class
SNLPCorpus
{
var
documents
:
[
Document
]
var
documents
:
[
Document
]
var
encodedDocuments
:
[[(
Int
,
Int
)]]
private
var
_dictionary
:
SNLPDictionary
<
String
,
Int
>
private
var
_dictionary
:
SNLPDictionary
<
String
,
Int
>
var
dictionary
:
SNLPDictionary
<
String
,
Int
>
{
var
dictionary
:
SNLPDictionary
<
String
,
Int
>
{
...
@@ -31,6 +33,8 @@ class SNLPCorpus {
...
@@ -31,6 +33,8 @@ class SNLPCorpus {
init
(
_
input
:
[
String
]?
=
nil
,
characterFilters
:
[
CharacterSet
]?
=
[
CharacterSet
.
punctuationCharacters
,
CharacterSet
.
decimalDigits
],
tokenFilters
:
[
Set
<
Word
>
]?
=
[
basicStopwordSet
])
{
init
(
_
input
:
[
String
]?
=
nil
,
characterFilters
:
[
CharacterSet
]?
=
[
CharacterSet
.
punctuationCharacters
,
CharacterSet
.
decimalDigits
],
tokenFilters
:
[
Set
<
Word
>
]?
=
[
basicStopwordSet
])
{
self
.
documents
=
[]
self
.
documents
=
[]
self
.
encodedDocuments
=
[]
self
.
_dictionary
=
SNLPDictionary
<
String
,
Int
>
()
self
.
_dictionary
=
SNLPDictionary
<
String
,
Int
>
()
// Create a set of Characters to filter out
// Create a set of Characters to filter out
...
@@ -68,7 +72,7 @@ class SNLPCorpus {
...
@@ -68,7 +72,7 @@ class SNLPCorpus {
func
initializeDictionary
()
{
func
initializeDictionary
()
{
self
.
dictionary
.
addDocuments
(
documents
:
documents
)
encodedDocuments
=
self
.
dictionary
.
addDocuments
(
documents
:
documents
)
}
}
...
...
This diff is collapsed.
Click to expand it.
Sources/SwiftNLP/SNLPDefinitions.swift
+
3
−
1
View file @
2d561b2a
...
@@ -11,10 +11,12 @@ public typealias Word = String
...
@@ -11,10 +11,12 @@ public typealias Word = String
public
typealias
Document
=
[
Word
]
public
typealias
Document
=
[
Word
]
public
typealias
Corpus
=
[
Document
]
public
typealias
Corpus
=
[
Document
]
public
typealias
EncodedWord
=
any
BinaryInteger
public
typealias
EncodedWord
=
(
Int
,
Int
)
public
typealias
EncodedDocument
=
[
EncodedWord
]
public
typealias
EncodedDocument
=
[
EncodedWord
]
public
typealias
EncodedCorpus
=
[
EncodedDocument
]
public
typealias
EncodedCorpus
=
[
EncodedDocument
]
public
typealias
EncodedWerd
=
BinaryInteger
&
Codable
public
typealias
Topic
=
[(
word
:
Word
,
probability
:
Double
)]
public
typealias
Topic
=
[(
word
:
Word
,
probability
:
Double
)]
public
typealias
TopicDistribution
=
[
Topic
]
public
typealias
TopicDistribution
=
[
Topic
]
...
...
This diff is collapsed.
Click to expand it.
Sources/SwiftNLP/SNLPDictionary.swift
+
18
−
22
View file @
2d561b2a
...
@@ -90,10 +90,15 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
...
@@ -90,10 +90,15 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
// >>> len(dct)
// >>> len(dct)
// 10
// 10
// """
// """
mutating
func
addDocuments
(
documents
:
[
Document
])
{
mutating
func
addDocuments
(
documents
:
[
Document
])
->
[[(
Value
,
Int
)]]
{
var
result
=
[[(
Value
,
Int
)]]()
for
document
in
documents
{
for
document
in
documents
{
documentToBagOfWords
(
document
:
document
,
allowUpdate
:
true
)
result
.
append
(
documentToBagOfWords
(
document
:
document
,
allowUpdate
:
true
)
)
}
}
return
result
}
}
...
@@ -125,37 +130,28 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
...
@@ -125,37 +130,28 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
// ([(2, 1)], {u'this': 1, u'is': 1})
// ([(2, 1)], {u'this': 1, u'is': 1})
// """
// """
mutating
func
documentToBagOfWords
(
document
:
Document
,
allowUpdate
:
Bool
=
false
,
returnMissing
:
Bool
=
false
)
{
mutating
func
documentToBagOfWords
(
document
:
Document
,
allowUpdate
:
Bool
=
false
)
->
[(
Value
,
Int
)]
{
// Construct a Dictionary containing the count of each token
// Construct a Dictionary containing the count of each token
var
counter
:
[
Key
:
Int
]
=
Dictionary
()
var
counter
:
[
Word
:
Int
]
=
Dictionary
()
for
token
in
document
{
for
token
in
document
{
counter
[
token
as!
Key
,
default
:
0
]
+=
1
counter
[
token
,
default
:
0
]
+=
1
}
}
// Fill in missing values in our Dictionary
// Fill in missing values in our Dictionary
if
allowUpdate
||
returnMissing
{
if
allowUpdate
{
let
missing
=
counter
.
filter
{
!
token2id
.
keys
.
contains
(
$0
.
key
)
}
let
missing
=
counter
.
filter
{
!
token2id
.
keys
.
contains
(
$0
.
key
as!
Key
)
}
.
sorted
{
$0
.
key
<
$1
.
key
}
.
sorted
{
$0
.
key
<
$1
.
key
}
if
allowUpdate
{
for
(
key
,
_
)
in
missing
{
for
(
key
,
_
)
in
missing
{
token2id
[
key
]
=
Value
(
token2id
.
count
)
token2id
[
key
as!
Key
]
=
Value
(
token2id
.
count
)
}
}
}
}
}
// Create a result
// Create a result
let
result
=
counter
.
compactMap
{
(
key
,
value
)
->
(
Value
,
Int
)?
in
//TODO: Confirm that ChatGPT generated this correctly
let
result
=
counter
.
compactMap
({
(
token2id
[
$0
.
key
as!
Key
]
!
,
$0
.
value
)
})
guard
let
tokenIndex
=
token2id
[
key
]
else
{
return
nil
}
return
(
Value
(
tokenIndex
),
value
)
as
(
Value
,
Int
)
}
.
reduce
(
into
:
[
Value
:
Int
]())
{
dict
,
tuple
in
dict
[
tuple
.
0
]
=
tuple
.
1
}
// Update our counters
// Update our counters
if
allowUpdate
{
if
allowUpdate
{
numDocs
+=
1
numDocs
+=
1
...
@@ -168,7 +164,7 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
...
@@ -168,7 +164,7 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
self
.
dfs
[
tokenid
,
default
:
0
]
+=
1
self
.
dfs
[
tokenid
,
default
:
0
]
+=
1
}
}
}
}
return
result
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment