format_task_2
FIT5196 Assessment 1
Student Name:
Student ID:
Date: 02/04/2017
Version: 2.0
Environment: Python 3.6.0 and Anaconda 4.3.0 (64-bit)
Libraries used:
collections (for calculating word frequencies)
re 2.2.1 (for regular expressions)
os (for joining paths, splitting file names, and checking whether a file exists)
1. Introduction
This task is to build sparse representations for the meeting transcripts generated in task 1, which includes word tokenization, vocabulary generation, and the generation of sparse representations.
2. Import libraries
In[2]:
from collections import defaultdict, Counter
from os import listdir
from os.path import isfile, join, split, exists, splitext
import re
3. Word tokenization and vocabulary generation
Construct the stop word list.
Loop over the txt files generated by task 1, tokenizing each one, removing stop words, and collecting all the words together.
Calculate the frequency of each word and delete words whose frequency exceeds 132 (see the toy sketch below).
Sort the vocabulary in alphabetical order, add an index to each word, and save it to a file.
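As a toy illustration of the frequency filter in step 3 (the tokens and the threshold of 2 are made up purely for this sketch; the real pipeline below uses the task 1 transcripts and a threshold of 132):

from collections import defaultdict
toy_segments = [["project", "kickoff", "project"], ["project", "budget"]]
frequency = defaultdict(int)
for segment in toy_segments:
    for token in segment:
        frequency[token] += 1          # project: 3, kickoff: 1, budget: 1
# Keep only tokens whose overall frequency is at or below the threshold
filtered = [[t for t in segment if frequency[t] <= 2] for segment in toy_segments]
# filtered == [['kickoff'], ['budget']]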
In[2]:
# Read stop words from the ./stopwords_en.txt file to construct the stop word list
stopwords_file = './stopwords_en.txt'
f = open(stopwords_file)
line = f.readline()
stopwords = []
while line:
    stopwords.append(line.strip())
    line = f.readline()
# Define a tokenization function: given a text segment, tokenize it with a regular expression and remove stop words
def extract_tokens(document, stopwords):
    # Tokenize using regular expression matching, lowercasing the text first
    words = re.findall(r"\w+(?:[-]\w+)?", document.lower())
    # Remove words that appear in the stop word list
    return [word for word in words if word not in stopwords]
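# Illustrative usage of extract_tokens (assumes "please" is in the stop word list):
#   extract_tokens("Follow-up items, please.", stopwords)  ->  ['follow-up', 'items']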
# Read a txt file generated by task 1 and produce a list with each sentence as an element
def load_txt(txt_file):
    f = open(txt_file)
    line = f.readline()
    sentence_list = []
    while line:
        # Skip the asterisk separator lines
        if line.strip() != "**********":
            sentence_list.append(line.strip())
        line = f.readline()
    return sentence_list
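# Illustrative usage of load_txt (the path and sentences are made up; task 1 files are
# assumed to contain one sentence per line, with "**********" lines as topic separators):
#   load_txt('./txt_files/example_meeting.txt')
#   -> ['first sentence of topic one', 'second sentence of topic one', 'first sentence of topic two', ...]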
# Call load_txt on multiple txt files and merge all sentences into a single list
def batch_load_txt(onlyfiles):
    meeting_transcript_list = []
    for txt_file in onlyfiles:
        # Call the load_txt function to read one txt file
        meeting_transcript = load_txt(txt_file)
        # Flatten by extend
        meeting_transcript_list.extend(meeting_transcript)
    return meeting_transcript_list
# Input a list of paragraphs and tokenize each paragraph in the list
def generate_segment_tokens_list(meeting_transcript_list):
    # Tokenize each paragraph; the tokens of each paragraph are kept as a sub-list inside the outer list
    meeting_transcript_list_tokens = list(map(lambda x: extract_tokens(x, stopwords), meeting_transcript_list))
    # Initialize a dict with a default value of 0
    frequency = defaultdict(int)
    # Loop over each sub-list in meeting_transcript_list_tokens
    for meeting_transcript_tokens in meeting_transcript_list_tokens:
        for token in meeting_transcript_tokens:
            # Count the frequency of each word
            frequency[token] += 1
    # Use the frequency counts to delete words with a frequency greater than 132 from meeting_transcript_list_tokens
    tokens_list = [[token for token in meeting_transcript_tokens if frequency[token] <= 132]
                   for meeting_transcript_tokens in meeting_transcript_list_tokens]
    # Return the filtered result
    return tokens_list

# Use a list containing multiple paragraphs to generate a dictionary of words; the key is a word and the value is an index
# Words in the vocabulary are sorted in alphabetical order
# The tokens_list input is the output of generate_segment_tokens_list
def generate_token_dict(tokens_list):
    tokens_set = []
    # Take the words out of each sub-list into one large list
    [tokens_set.extend(tokens) for tokens in tokens_list]
    # Drop duplicate words and sort
    distinct_sorted_token = sorted(set(tokens_set))
    # Create the index of each word
    token_idx = range(0, len(distinct_sorted_token))
    # Combine each word and its index as a key-value pair; the key is the word and the value is the index
    sorted_token_set = zip(distinct_sorted_token, token_idx)
    # Put the key-value pairs into a dict
    token_dict = dict(sorted_token_set)
    return token_dict

# Save each word and its index to the specified file
# Input a dict whose keys are words and whose values are indices, and the output file path
def output_vocab(vocab_dict, output_file):
    # Turn vocab_dict into a list and sort it
    vocab_list = [(word, idx) for word, idx in vocab_dict.items()]
    vocab_list_sorted = sorted(vocab_list, key=lambda x: x[0])
    f = open(output_file, 'w')
    for word, idx in vocab_list_sorted:
        f.write("%s:%s\n" % (word, idx))
    f.close()

In[]:
txt_files_dir = './txt_files'
onlyfiles = [join(txt_files_dir, f) for f in listdir(txt_files_dir)
             if isfile(join(txt_files_dir, f)) and splitext(f)[1] == '.txt']
meeting_transcript_list = batch_load_txt(onlyfiles)
segment_tokens_list = generate_segment_tokens_list(meeting_transcript_list)
token_dict = generate_token_dict(segment_tokens_list)
vocab_output_file = './vocab.txt'
output_vocab(token_dict, vocab_output_file)
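For reference, each line of vocab.txt written by output_vocab has the form word:index, with words in alphabetical order; the words and indices below are invented purely for illustration:

agenda:0
budget:1
catering:2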
4. Generate the topic boundaries encoding
Loop through each txt file generated by task 1 and do the following:
1) Count the number of lines in the file
2) Record the positions of the separators
3) Generate a vector of zeros with length equal to the number of lines, and assign 1 to the element just before each separator position, indicating a topic boundary
4) Convert the vector to a string and output it to a file
In[3]:
# Enter a single txt file; output the boundaries encoded as a boolean vector
def generate_topic_seg(topic_txt_file):
    f = open(topic_txt_file, 'r')
    line = f.readline()
    count = 0
    boundaries = []
    # Loop over every line of the txt file
    while line:
        # Asterisk separator lines do not count towards the number of rows
        if line.strip() != "**********":
            count += 1
        else:
            # When an asterisk separator is encountered, a topic boundary occurs,
            # and the boundary position is recorded in a list
            boundaries.append(count)
        line = f.readline()
    # Create a list of zeros with a length equal to the number of rows
    zero_list = [0] * count
    # Set the element at each boundary position to 1
    for i in boundaries:
        zero_list[i-1] = 1
    # Join the binary vector with commas
    topic_seg = ",".join(map(str, zero_list))
    meeting_transcript = split(topic_txt_file)[-1].replace(".txt", "")
    # Prepend the file ID for output to the file
    return "%s:%s" % (meeting_transcript, topic_seg)

# Loop through each file to get all the vectors
def batch_generate_topic_seg(onlyfiles):
    topic_seg_list = []
    for txt_file in onlyfiles:
        topic_seg_list.append(generate_topic_seg(txt_file))
    return topic_seg_list

# Save each file's corresponding vector into the topic_segs.txt file
def output_topic_seg(topic_seg_list, output_file):
    f = open(output_file, 'w')
    for topic_seg in topic_seg_list:
        f.write("%s\n" % topic_seg)
    f.close()

In[]:
topic_seg_output_file = './topic_segs.txt'
topic_seg_list = batch_generate_topic_seg(onlyfiles)
output_topic_seg(topic_seg_list, topic_seg_output_file)
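For reference, each line of topic_segs.txt written above has the form file_id:boolean_vector, where a 1 marks the last line of a topic segment; the file ID and vector below are invented purely for illustration:

example_meeting:0,0,1,0,0,1,0,1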
5. Transform paragraphs to sparse representation
Loop through each txt file generated by task 1:
1) Split the file content into paragraphs
2) Tokenize each paragraph
3) Count the frequency of each word in the paragraph and generate index:frequency pairs
4) Join all the pairs of a paragraph with commas and output them to the result file
In[4]:
# Enter a txt file and the word dictionary generated in the first step, and generate a sparse vector for each paragraph in the txt file
# The key is the word index and the value is the frequency with which the word appears in the paragraph
def sparse_txt_file(txt_file, vocab_dict):
    f = open(txt_file, 'r')
    # Split the file content by the asterisk separator
    line = f.readline()
    lines = []
    while line:
        lines.append(line.strip())
        line = f.readline()
    lines.pop(-1)
    paragraph_list = " ".join(lines).strip().split("**********")
    # Tokenize each paragraph and remove stop words
    paragraph_list_tokens = map(lambda x: extract_tokens(x, stopwords), paragraph_list)
    # Convert the words to obtain a list with the form [index:value, ...]
    paragraph_sparse_rep = map(lambda x: trans_tokens_to_index(x, vocab_dict), paragraph_list_tokens)
    return paragraph_sparse_rep

# Use the word dictionary to transform each word into its index, and at the same time count the frequency of
# occurrence of each word in the corresponding paragraph.
def trans_tokens_to_index(tokens_list, vocab_dict):
    # Convert only words that exist in the word dictionary
    tokens_index_list = [vocab_dict.get(token) for token in tokens_list if token in vocab_dict]
    # How often each word appears in the paragraph
    counts = Counter(tokens_index_list)
    # Word index as the key, frequency as the value, generating a list of index:value elements
    sparse_vec = ["%s:%s" % (token, freq) for token, freq in counts.items()]
    return ",".join(sparse_vec)

# Write each paragraph's corresponding sparse vector into the result file
def output_sparse_txt(paragraph_sparse_rep, output_file):
    f = open(output_file, 'w')
    for sparse_rep in paragraph_sparse_rep:
        f.write("%s\n" % sparse_rep)
    f.close()
def batch_output_sparse_txt(txt_file_list, token_dict, output_dir):
    # Loop through each txt file
    for topic_txt_file in txt_file_list:
        # Output file name
        sparse_rep_output_file = join(output_dir, split(topic_txt_file)[-1])
        # Call sparse_txt_file to extract the sparse vectors of one txt file
        paragraph_sparse_rep = sparse_txt_file(topic_txt_file, token_dict)
        # Output the sparse vectors into the result file
        output_sparse_txt(paragraph_sparse_rep, sparse_rep_output_file)

In[7]:
sparse_rep_output_dir = './sparse_files'
batch_output_sparse_txt(onlyfiles, token_dict, sparse_rep_output_dir)
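For reference, each line of a file in ./sparse_files corresponds to one paragraph and holds comma-separated index:frequency pairs, where the index refers to vocab.txt; the line below is invented purely for illustration:

12:2,857:1,1034:3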