from BPTree import *

class WebSearch(object):
	''' a tiny keyword search engine

	documents are split into keywords and every (keyword, document number)
	pair is inserted into an index provided by the BPTree module;
	document numbers are positions in the internal list of document names
	'''

	def __init__(self):
		# NOTE(review): 10 is passed straight to BPTree; presumably the tree
		# order / node capacity -- confirm against the BPTree module
		self._index=BPTree(10)
		# document names, indexed by document number
		self._documents=[]

	def get_document_names(self, numbers):
		''' the results of a query are document numbers, translate them back to document names

		numbers: any iterable of document numbers
		returns a list of names in the same order as numbers
		raises IndexError for a number that was never handed out by load
		'''
		return [self._documents[number] for number in numbers]

	def get_next_document_number(self):
		''' return the number the next loaded document will receive '''
		return len(self._documents)

	def _clean(self, document):
		''' clean up a document and split it into keywords

		the document is lowercased and split on whitespace; every character
		outside a-z is then stripped from each piece (so "bona-fide" becomes
		"bonafide" and "publishers." becomes "publishers")
		yields only keywords of length 3 or more
		'''
		for word in document.lower().split():
			# drop digits, punctuation and anything else that is not a-z
			keyword=''.join(char for char in word if 'a'<=char<='z')
			if len(keyword)>2:
				yield keyword

	def load(self, document, document_name):
		'''  add document to the index

		document: the text to index
		document_name: the name reported back by get_document_names
		'''
		# the number must be taken before the name is appended
		document_number=self.get_next_document_number()
		self._documents.append(document_name)
		for keyword in self._clean(document):
			self._index.insert((keyword, document_number))

	def search(self, keyword):
		''' return the documents containing keyword

		the result is whatever the index's search returns; callers in this
		file iterate over it as document numbers
		'''
		return self._index.search(keyword)

	def intersect(self, L):
		''' return the intersection of the lists of documents in L

		L: an iterable of iterables of document numbers
		returns a sorted list of the numbers present in every element of L;
		an empty L (list, tuple, exhausted iterator, ...) gives []
		'''
		# NOTE(review): an earlier comment suggested re-writing this as a merge,
		# since the inputs are said to be yielded in order
		common=None
		for numbers in L:
			if common is None:
				common=set(numbers)
			else:
				common.intersection_update(numbers)
		if common is None:
			# L had no elements at all
			return []
		return sorted(common)

	def union(self, L):
		''' return the union of the lists of documents in L

		L: an iterable of iterables of document numbers
		returns a sorted list of the numbers present in at least one element of L
		'''
		# NOTE(review): an earlier comment suggested re-writing this as a merge,
		# since the inputs are said to be yielded in order
		combined=set()
		for numbers in L:
			combined.update(numbers)
		return sorted(combined)


if __name__=='__main__':

	# demo: index a handful of sample documents, then run a few queries
	engine=WebSearch()
	samples=[
		("this is a test", "firstDocument"),
		("here is something completely new", "secondDocument"),
		("quality books all our ebooks were previously published by bona fide publishers.", "thirdDocument"),
		("We digitized and diligently proofread them with the help of thousands of volunteers", "fourthDocument"),
		("someone digitized the proofread document to help the volunteers", "fifthDocument"),
		("All but four stories are narrated by Holmess friend and biographer", "http://en.wikipedia.org/wiki/Sherlock_Holmes"),
	]
	for text, name in samples:
		engine.load(text, name)

	# dump the index keys, then look up single keywords
	print(engine._index.keys())
	print(list(engine.search("the")))
	for word in ("the", "proofread"):
		print(engine.get_document_names(engine.search(word)))

	first_words=("someone", "four", "friend")
	for word in first_words:
		print(list(engine.search(word)))

	# documents containing all three keywords
	first_query=[engine.search(word) for word in first_words]
	print(engine.get_document_names(engine.intersect(first_query)))

	# combined query; the result is computed but neither stored nor printed
	second_query=[engine.search(word) for word in ("hi", "there", "friend")]
	engine.intersect([engine.union(first_query), engine.intersect(second_query)])
