I don't think this has much to do with ArcGIS... You can find more on this topic on Stack Overflow: http://stackoverflow.com/questions/19887466/program-done-but-not-running-correctly

To make the code work you should apply some changes:

import os.path, math
def main():
    ''' Find the known author whose signature best matches a mystery text.

    Reads the mystery text, computes its five linguistic features, compares
    the resulting signature with every signature file in the signatures
    folder and prints the author with the lowest (most similar) score.
    Both locations are hard-coded below; adjust them to the local setup.
    Prints a message and returns early when the folder holds no files. '''
    mystery_filename = r'C:\Project\_Forums\compareSignatures\in\infile.txt'
    folder = r'C:\Project\_Forums\compareSignatures\signatures'
    # folder = r'C:\Project\_Forums\compareSignatures\comp'

    # 'with' closes the file even when reading raises
    with open(mystery_filename, 'r') as infile:
        text = infile.read()

    # same layout as a stored signature: name first, then the five features
    mystery_signature = [
        mystery_filename,
        average_word_length(text),
        type_token_ratio(text),
        hapax_legomana_ratio(text),
        average_sentence_length(text),
        avg_sentence_complexity(text),
    ]
    # weights[0] lines up with the name slot; compare_signatures ignores it
    weights = [0, 11, 33, 50, 0.4, 4]
    # print() with a single argument behaves the same on Python 2 and 3
    print("mystery_signature={0}".format(mystery_signature))

    # every file in this directory must be a linguistic signature
    files = os.listdir(folder)

    ## # create some signature files
    ## for this_file in files:
    ##     compfilename = os.path.join(folder, this_file)
    ##     compfile = open(compfilename, 'r')
    ##     text = compfile.read()
    ##     compfile.close()
    ##     signature = [compfilename]
    ##     signature.append(average_word_length(text))
    ##     signature.append(type_token_ratio(text))
    ##     signature.append(hapax_legomana_ratio(text))
    ##     signature.append(average_sentence_length(text))
    ##     signature.append(avg_sentence_complexity(text))
    ##
    ##     print ""
    ##     for row in signature:
    ##         print row

    if not files:
        # nothing to compare against; indexing files[0] would raise IndexError
        print("no signature files found in {0}".format(folder))
        return

    best_score = None
    best_author = None
    for this_file in files:
        signature = read_signature(os.path.join(folder, this_file))
        score = compare_signatures(mystery_signature, signature, weights)
        # strict '<' keeps the first file seen when two scores tie
        if best_score is None or score < best_score:
            best_score = score
            best_author = signature[0]

    print("best author match: {} with score {}".format(best_author, best_score))
def clean_up(s):
    ''' Return a copy of the string s in which all letters have been
    converted to lowercase and punctuation, newline, tab and carriage-return
    characters have been stripped from both ends.
    Inner punctuation is left untouched; spaces are not stripped. '''
    punctuation = '''!"',;:.-?)([]<>*#\n\t\r'''
    return s.lower().strip(punctuation)
def average_word_length(text):
    ''' Return the average length of all words in text, not counting the
    punctuation that surrounds a word.

    text is a string. It is split on whitespace and every token is passed
    through clean_up; tokens that clean_up reduces to nothing (pure
    punctuation) are not counted as words.
    Returns 0.0 when text contains no words. '''
    cleaned = (clean_up(token) for token in text.split())
    words = [word for word in cleaned if word]
    if not words:
        # avoid ZeroDivisionError on empty or punctuation-only text
        return 0.0
    # float() keeps true division under Python 2 as well
    return sum(len(word) for word in words) / float(len(words))
def type_token_ratio(text):
    ''' Return the type token ratio (TTR) for this text.

    TTR is the number of different words divided by the total number of
    words. text is a string; words are its whitespace-separated tokens,
    compared exactly as written (case and attached punctuation count).
    Returns 0.0 when text contains no words. '''
    words = text.split()
    if not words:
        # avoid ZeroDivisionError on empty or whitespace-only text
        return 0.0
    # a set keeps one entry per distinct token
    return len(set(words)) / float(len(words))
def hapax_legomana_ratio(text):
    ''' Return the hapax_legomana ratio for this text.

    This ratio is the number of words that occur exactly once divided
    by the total number of words. text is a string; words are its
    whitespace-separated tokens with every comma removed.
    Returns 0.0 when text contains no words. '''
    occurrences = {}
    total = 0
    for token in text.split():
        total += 1
        word = token.replace(',', '')
        # plain tally; the words seen exactly once are picked out below
        occurrences[word] = occurrences.get(word, 0) + 1
    if total == 0:
        # avoid ZeroDivisionError on empty or whitespace-only text
        return 0.0
    seen_once = sum(1 for count in occurrences.values() if count == 1)
    return seen_once / float(total)
def split_on_separators(original, separators):
    ''' Return a list of non-empty, non-blank strings from the original string
    determined by splitting the string on any of the separators.

    separators is a string of single-character separators.
    The pieces are returned as found (not stripped), in order, and the text
    after the last separator is included as the final piece. '''
    pieces = []
    current = []
    for char in original:
        if char in separators:
            pieces.append(''.join(current))
            current = []
        else:
            current.append(char)
    # keep whatever follows the last separator (or the whole string when
    # no separator occurs at all)
    pieces.append(''.join(current))
    # one filtering pass drops both empty and whitespace-only pieces
    return [piece for piece in pieces if piece.strip()]
def average_sentence_length(text):
    ''' Return the average number of words per sentence in text.

    text is a string. Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file.
    Words are the whitespace-separated tokens of the whole text.
    Returns 0.0 when no sentence is found. '''
    sentences = split_on_separators(text, '?!.')
    if not sentences:
        # avoid ZeroDivisionError when the split yields nothing
        return 0.0
    return len(text.split()) / float(len(sentences))
def avg_sentence_complexity(text):
    ''' Return the average number of phrases per sentence.

    text is a string. Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file.
    Phrases are substrings of a sentences separated by
    one or more of the following delimiters ,;:
    Returns 0.0 when no sentence is found. '''
    terminators = '?!.'
    delimiters = ',;:'
    sentences = split_on_separators(text, terminators)
    if not sentences:
        # avoid ZeroDivisionError when the split yields nothing
        return 0.0
    # A phrase ends at a phrase delimiter or where its sentence ends, so
    # splitting on both sets counts phrases sentence by sentence. Splitting
    # on the delimiters alone would count a phrase that runs across a
    # sentence boundary only once.
    phrases = split_on_separators(text, terminators + delimiters)
    return len(phrases) / float(len(sentences))
def compare_signatures(sig1, sig2, weight):
    ''' Return a non-negative real number indicating the similarity of two
    linguistic signatures. The smaller the number the more similar the
    signatures. Zero indicates identical signatures.

    sig1 and sig2 are 6 element lists with the following elements
    0 : author name (a string)
    1 : average word length (float)
    2 : TTR (float)
    3 : Hapax Legomana Ratio (float)
    4 : average sentence length (float)
    5 : average sentence complexity (float)
    weight is a list of multiplicative weights to apply to each
    linguistic feature. weight[0] is ignored.
    Raises IndexError when any of the three lists has fewer than 6 elements.
    '''
    # Features live at indices 1..5; index 0 is the author name. Explicit
    # indexing (rather than zip) makes a short signature fail loudly.
    return sum(abs(sig1[i] - sig2[i]) * weight[i] for i in range(1, 6))
def read_signature(filename):
    ''' Read a linguistic signature from filename and return it as
    list of features.

    The first line of the file is the author name and is returned as a
    string without surrounding whitespace. Every following non-blank line
    is converted to a float; blank lines are skipped.
    Raises ValueError when a non-blank feature line is not a number. '''
    # 'with' closes the file even when a line fails to parse
    with open(filename, 'r') as cmpfile:
        # the first feature is a string so it doesn't need casting to float;
        # strip() removes the newline that readline() keeps
        result = [cmpfile.readline().strip()]
        # all remaining features are real numbers
        for line in cmpfile:
            value = line.strip()
            if value:
                result.append(float(value))
    return result
# Run the comparison only when this file is executed as a script.
if __name__ == '__main__':
    main()
Kind regards, Xander