Commit 57cabd61 by Achyudh Ram

Remove unwanted files and fix directory structure

parent 645c78ec
import json
import os

from sklearn.model_selection import train_test_split

def get_review_text(review):
    """
    Get a string containing the title and body of the review
    :param review: An IMDB review dict
    :return: String containing the review title and body
    """
    title, body = "", ""
    if review['title'] is not None:
        title = ' '.join(review['title'].split()) + " "
    if review['review'] is not None:
        body = ' '.join(review['review'].split())
    return title + body

def get_binary_label(label):
    """
    Convert a 1-10 star rating into a 10-digit one-hot encoded label,
    e.g. get_binary_label(3) returns '0010000000'
    :param label: Integer rating between 1 and 10
    :return: 10-digit one-hot encoded label string
    """
    category_label = [0] * 10
    category_label[label - 1] = 1  # Set the digit corresponding to the rating
    return ''.join(map(str, category_label))

def parse_documents():
    """
    Extract the reviews from the IMDB dataset and create train/dev/test splits
    :return: Three lists containing the train, dev and test splits along with the labels
    """
    with open(os.path.join("data", "reviews.json"), 'r') as json_file:
        reviews = list()
        for review in json.load(json_file):
            reviews.append((get_binary_label(review['rating']), get_review_text(review)))
    train_documents, test_documents = train_test_split(reviews, test_size=0.3, random_state=37)
    train_documents, validation_documents = train_test_split(train_documents, test_size=0.28, random_state=53)
    return train_documents, validation_documents, test_documents

if __name__ == "__main__":
    train_documents, validation_documents, test_documents = parse_documents()
    print("Train, dev and test dataset sizes:", len(train_documents), len(validation_documents), len(test_documents))
    with open(os.path.join("data", "imdb_train.tsv"), 'w', encoding='utf8') as tsv_file:
        for label, document in train_documents:
            tsv_file.write(label + "\t" + document + "\n")
    with open(os.path.join("data", "imdb_validation.tsv"), 'w', encoding='utf8') as tsv_file:
        for label, document in validation_documents:
            tsv_file.write(label + "\t" + document + "\n")
    with open(os.path.join("data", "imdb_test.tsv"), 'w', encoding='utf8') as tsv_file:
        for label, document in test_documents:
            tsv_file.write(label + "\t" + document + "\n")
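The script writes one example per line: a one-hot label string, a tab, and the document text. For reference, a minimal sketch of reading one of the generated files back into (label, text) pairs; the read_tsv helper is illustrative and not part of the repository:

import os

def read_tsv(path):
    # Each line is a one-hot label string, a tab, and the document text
    with open(path, 'r', encoding='utf8') as tsv_file:
        return [tuple(line.rstrip("\n").split("\t", 1)) for line in tsv_file]

train_documents = read_tsv(os.path.join("data", "imdb_train.tsv"))
print(len(train_documents), train_documents[0][0])  # row count and a 10-digit label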
# Reuters-21578
A pre-processor to convert the Reuters-21578 dataset from SGM format to TSV according to the ApteMod train/test splits. It keeps only the documents that belong to at least one category with at least one document in both the training and the test sets, which yields 90 categories with a training set of 7769 documents and a test set of 3019 documents.
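The labels in the generated TSV files are 90-character one-hot strings indexed by topic_num_map (defined in topics.py below). As an illustrative sketch, assuming reuters_train.tsv has already been generated, a label can be decoded back into topic names like this:

from topics import topic_num_map

num_topic_map = {v: k for k, v in topic_num_map.items()}

def decode_label(label):
    # Each '1' digit marks membership in the topic at that index
    return [num_topic_map[i] for i, digit in enumerate(label) if digit == '1']

with open("reuters_train.tsv", 'r', encoding='utf8') as tsv_file:
    label, text = tsv_file.readline().rstrip("\n").split("\t", 1)
    print(decode_label(label))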
import os

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from topics import topic_num_map

def get_article_text(article):
    """
    Get a string containing the title and body of a Reuters article object
    :param article: A Reuters article object
    :return: String containing the article title and body
    """
    title = body = ""
    if article.title is not None:
        title = ' '.join(article.title.text.split()) + ". "
    if article.body is not None:
        body = ' '.join(article.body.text.split())
    if article.title is None and article.body is None:
        body = ' '.join(article.text.split())
    if title.strip() + body.strip() == "":
        # Drop into the debugger if an article has no recoverable text
        import pdb
        pdb.set_trace()
    return title + body

def get_article_label(topics):
    """
    Get a 90-digit binary one-hot encoded label corresponding to the given topic list
    :param topics: Set of topics to which an article belongs
    :return: 90-digit binary one-hot encoded label, or None if no topic is in topic_num_map
    """
    category_label = [0] * len(topic_num_map)
    for topic in topics:
        if topic.text in topic_num_map:
            category_label[topic_num_map[topic.text]] = 1
    if sum(category_label) > 0:
        return ''.join(map(str, category_label))
    return None

def parse_documents(validation_split=False):
    """
    Extract the Reuters-90 dataset from the SGM files in the data folder according to the ApteMod splits. This
    method returns the documents that belong to at least one of the categories that have at least one document in
    both the training and the test sets. The dataset has 90 categories with a training set of 7769 documents and
    a test set of 3019 documents.
    :param validation_split: If True, carve a validation split out of the training set
    :return: Lists containing the train and test splits (plus a validation split if requested) along with the labels
    """
    train_documents = list()
    test_documents = list()
    for file in os.listdir('data'):
        with open(os.path.join("data", file), 'r') as data:
            text = data.read()
        tree = BeautifulSoup(text, "html.parser")
        for article in tree.find_all("reuters"):
            if article.attrs['topics'] == "YES":
                label = get_article_label(article.topics.children)
                if label is not None:
                    if article.attrs['lewissplit'] == "TRAIN":
                        train_documents.append((label, get_article_text(article)))
                    elif article.attrs['lewissplit'] == "TEST":
                        test_documents.append((label, get_article_text(article)))
    if validation_split:
        train_documents, validation_documents = train_test_split(train_documents, test_size=0.25, random_state=37)
        return train_documents, validation_documents, test_documents
    return train_documents, test_documents

def split_dataset_file(tsv_file):
    """
    Convert the one-hot encoded labels in a TSV file back into lists of topic names
    :param tsv_file: Iterable of label-tab-text lines
    :return: List of (topic name list, text) tuples
    """
    num_topic_map = dict((v, k) for k, v in topic_num_map.items())
    output_list = list()
    for line in tsv_file:
        label, text = line.split("\t")
        output_labels = list()
        for i, digit in enumerate(label):
            if digit == '1':
                output_labels.append(num_topic_map[i])
        output_list.append((output_labels, text))
    return output_list

if __name__ == "__main__":
    train_documents, validation_documents, test_documents = parse_documents(validation_split=True)
    print("Train, validation and test dataset sizes:", len(train_documents), len(validation_documents), len(test_documents))
    with open("reuters_train.tsv", 'w', encoding='utf8') as tsv_file:
        for label, document in train_documents:
            tsv_file.write(label + "\t" + document + "\n")
    with open("reuters_validation.tsv", 'w', encoding='utf8') as tsv_file:
        for label, document in validation_documents:
            tsv_file.write(label + "\t" + document + "\n")
    with open("reuters_test.tsv", 'w', encoding='utf8') as tsv_file:
        for label, document in test_documents:
            tsv_file.write(label + "\t" + document + "\n")
topic_num_map = {'acq': 0, 'alum': 1, 'barley': 2, 'bop': 3, 'carcass': 4, 'castor-oil': 5, 'cocoa': 6, 'coconut': 7,
                 'coconut-oil': 8, 'coffee': 9, 'copper': 10, 'copra-cake': 11, 'corn': 12, 'cotton': 13,
                 'cotton-oil': 14, 'cpi': 15, 'cpu': 16, 'crude': 17, 'dfl': 18, 'dlr': 19, 'dmk': 20, 'earn': 21,
                 'fuel': 22, 'gas': 23, 'gnp': 24, 'gold': 25, 'grain': 26, 'groundnut': 27, 'groundnut-oil': 28,
                 'heat': 29, 'hog': 30, 'housing': 31, 'income': 32, 'instal-debt': 33, 'interest': 34, 'ipi': 35,
                 'iron-steel': 36, 'jet': 37, 'jobs': 38, 'l-cattle': 39, 'lead': 40, 'lei': 41, 'lin-oil': 42,
                 'livestock': 43, 'lumber': 44, 'meal-feed': 45, 'money-fx': 46, 'money-supply': 47, 'naphtha': 48,
                 'nat-gas': 49, 'nickel': 50, 'nkr': 51, 'nzdlr': 52, 'oat': 53, 'oilseed': 54, 'orange': 55,
                 'palladium': 56, 'palm-oil': 57, 'palmkernel': 58, 'pet-chem': 59, 'platinum': 60, 'potato': 61,
                 'propane': 62, 'rand': 63, 'rape-oil': 64, 'rapeseed': 65, 'reserves': 66, 'retail': 67, 'rice': 68,
                 'rubber': 69, 'rye': 70, 'ship': 71, 'silver': 72, 'sorghum': 73, 'soy-meal': 74, 'soy-oil': 75,
                 'soybean': 76, 'strategic-metal': 77, 'sugar': 78, 'sun-meal': 79, 'sun-oil': 80, 'sunseed': 81,
                 'tea': 82, 'tin': 83, 'trade': 84, 'veg-oil': 85, 'wheat': 86, 'wpi': 87, 'yen': 88, 'zinc': 89}
[Dolphin]
Timestamp=2019,4,4,18,50,38
Version=4
ViewMode=1