# -*- coding: UTF-8 -*-
"""
Universität Tübingen Seminar für Sprachwissenschaft
VL Programming and Data Analysis WS 2019-2020
Johannes Dellert, Gerhard Jäger
Assignment 07: Analyzing the Spanish Copulas
Template
"""
# All inflected forms of "ser" (lower-case, with orthographic accents).
# Forms shared by several paradigm cells (e.g. 1sg/3sg "era", or the
# subjunctive forms reused by the imperative) are listed only once.
conj_ser = {
    # non-finite forms
    "ser", "siendo", "sido", "sida", "sidos", "sidas",
    # present indicative (including voseo "sos")
    "soy", "eres", "sos", "es", "somos", "sois", "son",
    # imperfect indicative
    "era", "eras", "éramos", "erais", "eran",
    # preterite
    "fui", "fuiste", "fue", "fuimos", "fuisteis", "fueron",
    # future
    "seré", "serás", "será", "seremos", "seréis", "serán",
    # conditional
    "sería", "serías", "seríamos", "seríais", "serían",
    # present subjunctive
    "sea", "seas", "seamos", "seáis", "sean",
    # imperfect subjunctive (-ra)
    "fuera", "fueras", "fuéramos", "fuerais", "fueran",
    # imperfect subjunctive (-se)
    "fuese", "fueses", "fuésemos", "fueseis", "fuesen",
    # future subjunctive
    "fuere", "fueres", "fuéremos", "fuereis", "fueren",
    # imperative forms not already covered above
    "sé", "sed",
}
# All inflected forms of "estar" (lower-case, with orthographic accents).
# Forms shared by several paradigm cells (e.g. 1sg/3sg "estaba", or the
# subjunctive forms reused by the imperative) are listed only once.
conj_estar = {
    # non-finite forms
    "estar", "estando", "estado", "estada", "estados", "estadas",
    # present indicative
    "estoy", "estás", "está", "estamos", "estáis", "están",
    # imperfect indicative
    "estaba", "estabas", "estábamos", "estabais", "estaban",
    # preterite
    "estuve", "estuviste", "estuvo", "estuvimos", "estuvisteis", "estuvieron",
    # future
    "estaré", "estarás", "estará", "estaremos", "estaréis", "estarán",
    # conditional
    "estaría", "estarías", "estaríamos", "estaríais", "estarían",
    # present subjunctive
    "esté", "estés", "estemos", "estéis", "estén",
    # imperfect subjunctive (-ra)
    "estuviera", "estuvieras", "estuviéramos", "estuvierais", "estuvieran",
    # imperfect subjunctive (-se)
    "estuviese", "estuvieses", "estuviésemos", "estuvieseis", "estuviesen",
    # future subjunctive
    "estuviere", "estuvieres", "estuviéremos", "estuviereis", "estuvieren",
    # imperative form not already covered above
    "estad",
}
# Task 1
def load_sentences(filename):
    """Load sentences with POS tags from a file.

    Each non-blank line holds one sentence as whitespace-separated tokens of
    the shape ``form_POS``.

    :param filename: name of the file with the sentences, one sentence per line.
    :type filename: str
    :return: list of the sentences from the file, each represented by a list
        of (form, pos) tuples. A token without an underscore yields a
        1-tuple ``(form,)``.
    :rtype: list[list[tuple[str, str]]]
    """
    sentences_list = []
    # The context manager guarantees the file is closed even on errors.
    with open(filename, encoding="utf-8") as f:
        for line in f:
            # split() without arguments drops the newline and never yields
            # empty tokens, even with repeated spaces.
            tokens = line.split()
            if not tokens:
                continue  # a blank line is not a sentence
            # Split at the LAST underscore only, so a form that itself
            # contains "_" still produces a (form, pos) pair.
            sentences_list.append([tuple(token.rsplit("_", 1)) for token in tokens])
    return sentences_list
# Task 2
def lemmatize(adj):
    """Naively lemmatize Spanish adjectives.

    The suffix rules are tried from most to least specific; the generic
    ``-es`` rule must come last so it does not shadow the special plurals.

    :param adj: the adjective to be lemmatized
    :type adj: str
    :return: the lemma form of the adjective (the input itself if no rule applies)
    :rtype: str
    """
    if adj.endswith("esa"):  # francesa -> francés
        return adj[:-3] + "és"
    if adj.endswith("a"):  # buena -> bueno
        return adj[:-1] + "o"
    if adj.endswith(("esas", "eses")):  # francesas, franceses -> francés
        return adj[:-4] + "és"
    if adj.endswith(("as", "os")):  # buenas, buenos -> bueno
        return adj[:-2] + "o"
    if adj.endswith(("ntes", "nses", "bles", "bres")):  # importantes -> importante
        return adj[:-1]
    if adj.endswith(("les", "res", "nes")):  # fáciles -> fácil
        return adj[:-2]
    if adj.endswith("ces"):  # felices -> feliz
        return adj[:-3] + "z"
    if adj.endswith("es"):  # grandes -> grande
        return adj[:-1]
    return adj
# Task 3
def count_occurrences(sentences):
    """Count occurrences of adjectives as complements to forms of ser and estar.

    An adjective is counted when the token directly before it is tagged
    ``AUX``. Every such adjective lemma becomes a key in BOTH dictionaries
    (starting at 0), so the two results always share the same key set.

    :param sentences: a list of sentences, each represented by a list of (form,pos) tuples
    :type sentences: list[list[tuple[str, str]]]
    :return: two dictionaries storing the counts of adjective lemmas following
        forms of ser and estar in the sentences
    :rtype: tuple[dict[str, int], dict[str, int]]
    """
    freq_ser = {}
    freq_estar = {}
    for sentence in sentences:
        # Walk over (previous token, token) pairs by position. Looking the
        # predecessor up with sentence.index() would find the FIRST equal
        # token, and a sentence-initial adjective would wrap around to the
        # last token.
        for previous, token in zip(sentence, sentence[1:]):
            # Malformed tokens (not exactly form + tag) are skipped.
            if len(token) != 2 or token[1] != "ADJ":
                continue
            if len(previous) != 2 or previous[1] != "AUX":
                continue
            lemma = lemmatize(token[0])
            # The conjugation sets are lower-case; normalise so that a
            # sentence-initial copula such as "Es" is still recognised.
            copula = previous[0].lower()
            freq_ser.setdefault(lemma, 0)
            freq_estar.setdefault(lemma, 0)
            if copula in conj_ser:
                freq_ser[lemma] += 1
            if copula in conj_estar:
                freq_estar[lemma] += 1
    return (freq_ser, freq_estar)
# Task 4
def get_occurrence_sets(freq_ser, freq_estar):
    """Extract a partition of adjectives by well-attested co-occurrence with ser and estar.

    Only adjectives with a combined frequency of at least 10 are considered.
    An adjective counts as attested with a copula when it occurred with it
    more than once. A lemma missing from one dictionary is treated as having
    frequency 0 there.

    :param freq_ser: a dictionary mapping adjective lemmas into the number of
        times each adjective occurred after forms of ser
    :type freq_ser: dict[str, int]
    :param freq_estar: a dictionary mapping adjective lemmas into the number of
        times each adjective occurred after forms of estar
    :type freq_estar: dict[str, int]
    :return: Tuple of three sets (ser, estar, both) partitioning the adjectives
        with frequency >= 10 into the copulas they are attested with more than once.
    :rtype: tuple[set[str], set[str], set[str]]
    """
    ser_adj = set()
    estar_adj = set()
    both_adj = set()
    # Use the union of keys so that lemmas present in only one of the
    # dictionaries are neither skipped nor a KeyError.
    for lemma in freq_ser.keys() | freq_estar.keys():
        n_ser = freq_ser.get(lemma, 0)
        n_estar = freq_estar.get(lemma, 0)
        if n_ser + n_estar < 10:
            continue  # too rare overall to be classified
        with_ser = n_ser > 1
        with_estar = n_estar > 1
        if with_ser and with_estar:
            both_adj.add(lemma)
        elif with_ser:
            ser_adj.add(lemma)
        elif with_estar:
            estar_adj.add(lemma)
    return (ser_adj, estar_adj, both_adj)
# Task 5 (Bonus)
def search_bibliography(target, contents):
    """Search a LaTeX bibliography for targets using regular expressions.

    :param target: the target to search for: titles, authors, years, pages,
        colltitles, or the transformation to perform: transform_headings,
        transform_names
    :type target: str
    :param contents: the contents of the bibliography
    :type contents: str
    :return: for search targets, a list of matching substrings; for
        ``transform_*`` targets, the transformed contents as one string.
        An unknown target falls back to an empty pattern.
    :rtype: list[str] | str
    """
    # The standard-library module covers every pattern used here.
    import re

    if not target.startswith("transform_"):
        # All search patterns are compiled in verbose mode (re.X), so layout
        # whitespace and "#" comments inside them are ignored.
        search_patterns = {
            "titles": r"""
                \)\.\s          # the title begins after the "). " closing the year
                (               # beginning of the one group we are interested in
                  [\w\s+:,;-]+  # word characters, whitespace and some punctuation
                )               # end of the group
                \.              # the title ends before the next "."
            """,
            "authors": r"""
                \\bibitem\{\w*\}\s  # the author begins after "bibitem{key} "
                (                   # beginning of the one group we are interested in
                  .+                # any symbols, at least one
                )                   # end of the group
                \s\(                # the author ends before the " (" opening the year
            """,
            "years": r"\d{4}",
            "pages": r"\d+–\d+",
            "colltitles": r"In\s\{\\it\s(.*)\}",
        }
        return re.findall(search_patterns.get(target, ""), contents, re.X)

    # Each transformation is a (pattern, replacement) pair.
    transformations = {
        # Put every bibitem heading on a line of its own.
        "transform_headings": (r"(\\bibitem\{\w+\})", r"\n\1\n"),
        # Turn "Surname, I." into "I. Surname".
        "transform_names": (r"(\w+),(\s)([A-Z]\.)", r"\3\2\1"),
    }
    pattern, substitute = transformations.get(target, ("", ""))
    # flags must be passed by keyword: the fourth positional parameter of
    # re.sub is count, so a positional re.X would cap the substitutions at 64.
    return re.sub(pattern, substitute, contents, flags=re.X)
if __name__ == "__main__":
    # Load the tagged sentence corpus.
    sentences = load_sentences("spanish_tagged_spacy.txt")
    # Count the occurrences of adjectives with each copula.
    freq_ser, freq_estar = count_occurrences(sentences)
    # Get the co-occurrence sets of adjectives for each copula.
    ser, estar, both = get_occurrence_sets(freq_ser, freq_estar)
    print(sorted(estar)[:5])
    # print("ser:\n====\n" + "\n".join(sorted(ser)) + "\n")  # print the adjectives that only go with ser
    # print("estar:\n====\n" + "\n".join(sorted(estar)) + "\n")  # print the adjectives that only go with estar
    # print("both:\n====\n" + "\n".join(sorted(both)) + "\n")  # print the adjectives that can go with both copulas

    # Bonus task
    # Read with an explicit encoding: the page-range pattern contains a
    # non-ASCII en dash, so the platform default must not be relied on.
    with open("bibliography.tex", encoding="utf-8") as bib_file:
        file_contents = bib_file.read()
    print(search_bibliography("titles", file_contents))
    print(search_bibliography("authors", file_contents))
    print(search_bibliography("years", file_contents))
    print(search_bibliography("pages", file_contents))
    print(search_bibliography("colltitles", file_contents))
    print(search_bibliography("transform_headings", file_contents))
    print(search_bibliography("transform_names", file_contents))
Reviews
There are no reviews yet.