Hey, I've been wondering about this function problem for a while.

03-02-2014 08:08 AM
CrombaBaaaa
New Contributor
def compare_signatures(sig1, sig2, weight):
    """ (list, list, list of float) -> float

    Return a non-negative real number indicating the similarity of the two
    linguistic signatures, sig1 and sig2. The smaller the number the more
    similar the signatures. Zero indicates identical signatures.
   
    sig1 and sig2 are 6-item lists with the following items:
    0  : Author Name (a string)
    1  : Average Word Length (float)
    2  : Type Token Ratio (float)
    3  : Hapax Legomena Ratio (float)
    4  : Average Sentence Length (float)
    5  : Average Sentence Complexity (float)

    weight is a list of multiplicative weights to apply to each
    linguistic feature. weight[0] is ignored.

    >>> sig1 = ["a_string" , 4.4, 0.1, 0.05, 10.0, 2.0]
    >>> sig2 = ["a_string2", 4.3, 0.1, 0.04, 16.0, 4.0]
    >>> weight = [0, 11.0, 33.0, 50.0, 0.4, 4.0]
    >>> compare_signatures(sig1, sig2, weight)
    12.000000000000007
    """
XanderBakker
Esri Esteemed Contributor
I don't think this has much to do with ArcGIS...

You can find more on this topic on stackoverflow:
http://stackoverflow.com/questions/19887466/program-done-but-not-running-correctly

To make the code work, you need to apply a few changes:

import os.path

def main():

    mystery_filename = r'C:\Project\_Forums\compareSignatures\in\infile.txt'
    folder = r'C:\Project\_Forums\compareSignatures\signatures'
    # folder = r'C:\Project\_Forums\compareSignatures\comp'

    infile = open(mystery_filename, 'r')
    text = infile.read()
    infile.close()

    mystery_signature = [mystery_filename]
    mystery_signature.append(average_word_length(text))
    mystery_signature.append(type_token_ratio(text))
    mystery_signature.append(hapax_legomana_ratio(text))
    mystery_signature.append(average_sentence_length(text))
    mystery_signature.append(avg_sentence_complexity(text))

    weights = [0, 11, 33, 50, 0.4, 4]

    print "mystery_signature={0}".format(mystery_signature)

    # every file in this directory must be a linguistic signature
    files=os.listdir(folder)

##    # create some signature files
##    for this_file in files:
##        compfilename = os.path.join(folder, this_file)
##        compfile = open(compfilename, 'r')
##        text = compfile.read()
##        compfile.close()
##        signature = [compfilename]
##        signature.append(average_word_length(text))
##        signature.append(type_token_ratio(text))
##        signature.append(hapax_legomana_ratio(text))
##        signature.append(average_sentence_length(text))
##        signature.append(avg_sentence_complexity(text))
##
##        print ""
##        for row in signature:
##            print row


    this_file = files[0]
    signature = read_signature(os.path.join(folder,this_file))
    best_score = compare_signatures(mystery_signature, signature, weights)
    best_author = signature[0]
    for this_file in files[1:]:
        signature = read_signature(os.path.join(folder,this_file))
        score = compare_signatures(mystery_signature, signature, weights)
        if score < best_score:
            best_score = score
            best_author = signature[0]
    print( "best author match: {} with score {}".format(best_author, best_score))



def clean_up(s):
    ''' Return a version of string str in which all letters have been
    converted to lowercase and punctuation characters have been stripped
    from both ends. Inner punctuation is left untouched. '''

    punctuation = '''!"',;:.-?)([]<>*#\n\t\r'''
    result = s.lower().strip(punctuation)
    return result
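# For example, clean_up('"Hello!"') returns 'hello': the string is lower-cased
# and punctuation is stripped from the ends only, so inner punctuation such as
# the apostrophe in "don't" is kept.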


def average_word_length(text):
    ''' Return the average length of all words in text. Do not
    include surrounding punctuation in words.
    text is a non-empty list of strings each ending in \n.
    At least one line in text contains a word.'''

    # average length of all whitespace-separated tokens; note that surrounding
    # punctuation is not stripped here; apply clean_up() to each word if you
    # want to match the docstring exactly
    words = text.split()
    return sum(float(len(word)) for word in words) / float(len(words))


def type_token_ratio(text):
    ''' Return the type token ratio (TTR) for this text.
    TTR is the number of different words divided by the total number of words.
    text is a non-empty list of strings each ending in \n.
    At least one line in text contains a word. '''
    uniquewords = {}
    words=0
    for line in text.splitlines():
        line=line.strip().split()
        for word in line:
            words+=1
            if word in uniquewords:
                uniquewords[word]+=1
            else:
                uniquewords[word]=1
    TTR = float(len(uniquewords))/float(words)
    return TTR


def hapax_legomana_ratio(text):
    ''' Return the hapax_legomana ratio for this text.
    This ratio is the number of words that occur exactly once divided
    by the total number of words.
    text is a list of strings each ending in \n.
    At least one line in text contains a word.'''

    uniquewords = dict()
    words = 0
    for line in text.splitlines():
        for word in line.strip().split():
            words += 1
            word = word.replace(',', '').strip()
            # count how often each (comma-stripped) word occurs
            uniquewords[word] = uniquewords.get(word, 0) + 1

    # hapax legomena are the words that occur exactly once
    unique_count = 0
    for count in uniquewords.values():
        if count == 1:
            unique_count += 1
    HLR = float(unique_count) / float(words)
    return HLR


def split_on_separators(original, separators):
    ''' Return a list of non-empty, non-blank strings from the original string
    determined by splitting the string on any of the separators.
    separators is a string of single-character separators.'''

    result = []
    newstring = ''
    for char in original:
        if char in separators:
            # only keep pieces that contain something other than whitespace
            if newstring.strip():
                result.append(newstring)
            newstring = ''
        else:
            newstring += char
    # keep the final piece if the text does not end with a separator
    if newstring.strip():
        result.append(newstring)
    return result
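# For example:
#   split_on_separators('Hello world! How are you?', '?!.')
#   returns ['Hello world', ' How are you']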

def average_sentence_length(text):
    ''' Return the average number of words per sentence in text.
    text is guaranteed to have at least one sentence.
    Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file. '''

    # total number of whitespace-separated words divided by the number of
    # sentences found by splitting on the terminating punctuation
    words = len(text.split())
    sentences = split_on_separators(text, '?!.')
    ASL = float(words) / float(len(sentences))
    return ASL


def avg_sentence_complexity(text):
    '''Return the average number of phrases per sentence.
    Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file.
    Phrases are substrings of a sentences separated by
    one or more of the following delimiters ,;: '''

    sentences = split_on_separators(text, '?!.')
    # split on the phrase delimiters and also on the sentence terminators,
    # so that a phrase never spans a sentence boundary
    phrases = split_on_separators(text, ',;:?!.')
    ASC = float(len(phrases)) / float(len(sentences))
    return ASC



def compare_signatures(sig1, sig2, weight):
    '''Return a non-negative real number indicating the similarity of two
    linguistic signatures. The smaller the number the more similar the
    signatures. Zero indicates identical signatures.
    sig1 and sig2 are 6 element lists with the following elements
    0  : author name (a string)
    1  : average word length (float)
    2  : TTR (float)
    3  : Hapax Legomana Ratio (float)
    4  : average sentence length (float)
    5  : average sentence complexity (float)
    weight is a list of multiplicative weights to apply to each
    linguistic feature. weight[0] is ignored.
    '''
    result = 0
    # index 0 holds the author name, so only compare features 1 to 5
    for i in range(1, 6):
        result += abs(sig1[i] - sig2[i]) * weight[i]
    return result
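# For reference, with the example values from the question's docstring this
# works out to 11.0*|4.4-4.3| + 33.0*|0.1-0.1| + 50.0*|0.05-0.04|
# + 0.4*|10.0-16.0| + 4.0*|2.0-4.0| = 1.1 + 0.0 + 0.5 + 2.4 + 8.0 = 12.0
# (floating point rounding is why the docstring shows 12.000000000000007).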


def read_signature(filename):
    '''Read a linguistic signature from filename and return it as
    list of features. '''

    cmpfile = open(filename, 'r')
    # the first feature is the author name (a string); strip the newline
    result = [cmpfile.readline().strip()]
    # all remaining features are real numbers
    for line in cmpfile:
        result.append(float(line.strip()))
    cmpfile.close()
    return result
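# A signature file is expected to contain the author name on the first line,
# followed by one number per line in the same order as main() builds
# mystery_signature, e.g.:
#   some author name
#   4.4     (average word length)
#   0.1     (type token ratio)
#   0.05    (hapax legomena ratio)
#   10.0    (average sentence length)
#   2.0     (average sentence complexity)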



if __name__ == '__main__':
    main()
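If you only want to check compare_signatures against the example in your docstring (without creating any input files or folders), an interactive session should reproduce the value you posted:

>>> sig1 = ["a_string" , 4.4, 0.1, 0.05, 10.0, 2.0]
>>> sig2 = ["a_string2", 4.3, 0.1, 0.04, 16.0, 4.0]
>>> weight = [0, 11.0, 33.0, 50.0, 0.4, 4.0]
>>> compare_signatures(sig1, sig2, weight)
12.000000000000007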



Kind regards,

Xander