Я пытаюсь создать перевернутый список предложений и их положение в исходном документе, используя Python, и с треском провалился.
Допустим, у меня есть два документа:
Документ 1
I like bananas. I don't like pears.
Документ 2
I don't like heights. I like bananas.
Я пытаюсь построить указатель предложений в этих документах, который выглядит так:
Sent File[pos]
I like bananas Doc1[1], Doc2[2]
I don't like pears Doc1[2]
I don't like heights Doc2[1]
Я нашел бесчисленное количество примеров построения перевернутого списка слов и файлов, в которых они могут быть найдены, но не нашел ничего, что касалось бы построения указателя предложений.
Я пытался взломать фрагмент кода на Github, который имеет дело с традиционным индексом слов, но я явно что-то упускаю, потому что мой хак не работает.
Код, который я использую, приведен ниже. Основное отличие моего кода от упомянутого кода выше заключается в том, что я использую NLTK для токенизации документов в предложения.
Мой код выглядит следующим образом:
import nltk.data
import codecs
import os
import unicodedata
def sentence_split(text):
sent_list = []
scurrent = []
sindex = None
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
for i, c in enumerate(text):
if c.isalnum():
scurrent.append(c)
sindex = i
elif scurrent:
scurrent_str = ''.join(map(str, scurrent))
sentence_prep = ''.join(tokenizer.tokenize(scurrent_str))
sentence = ''.join(sentence_prep)
sent_list.append((sindex - len(sentence) + 1, sentence))
if scurrent:
scurrent_str = ''.join(map(str, scurrent))
sentence_prep = ''.join(tokenizer.tokenize(scurrent_str))
sentence = ''.join(sentence_prep)
sent_list.append((sindex - len(sentence) + 1, sentence))
return sent_list
def sentence_normalize(sentences):
normalized_sentences = []
for index, sentence in sentences:
snormalized = sentence.lower()
normalized_sentences.append((index, snormalized))
return normalized_sentences
def sentence_index(text):
sentences = sentence_split(text)
sentences = sentence_normalize(sentences)
return sentences
def inverted_index(text):
inverted = {}
for index, sentence in sentence_index(text):
locations = inverted.setdefault(sentence, [])
locations.append(index)
return inverted
def inverted_index_add(inverted, doc_id, doc_index):
for sentence, locations in doc_index.items():
indices = inverted.setdefault(sentence, {})
indices[doc_id] = locations
return inverted
def search(inverted, query):
sentences = [sentence for _, sentence in sentence_index(query) if sentence in inverted]
results = [set(inverted[sentence].keys()) for sentence in sentences]
return reduce(lambda x, y: x & y, results) if results else []
if __name__ == '__main__':
doc1 = """
Niners head coach Mike Singletary will let Alex Smith remain his starting
quarterback, but his vote of confidence is anything but a long-term mandate.
Smith now will work on a week-to-week basis, because Singletary has voided
his year-long lease on the job.
"I think from this point on, you have to do what's best for the football team,"
Singletary said Monday, one day after threatening to bench Smith during a
27-24 loss to the visiting Eagles.
"""
doc2 = """
The fifth edition of West Coast Green, a conference focusing on "green" home
innovations and products, rolled into San Francisco's Fort Mason last week
intent, per usual, on making our living spaces more environmentally friendly
- one used-tire house at a time.
To that end, there were presentations on topics such as water efficiency and
the burgeoning future of Net Zero-rated buildings that consume no energy and
produce no carbon emissions.
"""
inverted = {}
documents = {'doc1':doc1, 'doc2':doc2}
for doc_id, text in documents.items():
doc_index = inverted_index(text)
inverted_index_add(inverted, doc_id, doc_index)
for sentence, doc_locations in inverted.items():
print (sentence, doc_locations)
queries = ['I think from this point on, you have to do whats best for the football team,"Singletary said Monday, one day after threatening to bench Smith during a 27-24 loss to the visiting Eagles']
for query in queries:
result_docs = search(inverted, query)
print("Search for '%s': %r" % (query, result_docs))
for _, sentence in sentence_index(query):
def extract_text(doc, index):
return documents[doc][index:index+20].replace('\n', ' ')
for doc in result_docs:
for index in inverted[sentence][doc]:
print (' - %s...' % extract_text(doc, index))
print
Вот фрагмент вывода:
niners {'doc1': [1]}
ninershead {'doc1': [2]}
ninersheadcoach {'doc1': [3]}
ninersheadcoachmike {'doc1': [4]}