# # File: kwic.py # Author: Kevin Tierney # Email: kevin@kevintierney.net # Website: http://www.kevintierney.net # # "Key Word In Context (KWIC)" program created as described on p. 32, fig. 1.3 # in Foundations of Statistical Natural Language Processing # (Manning, Schütze, 1999) # # Please use this code however you wish. # import sys import re import os.path def isInt(n): return re.match("^[-+]?[0-9]+$", n) # Appends str2 to str1 and then chops the string down to length, removing chars # from the left side. Returns the result. def concatTrim(str1, str2, length): str1 += str2 str1Len = len(str1) str1 = str1[(str1Len - length):] return str1 # # Updates bufQueue with str, trimming all buffers to the correct length # offset indicates where to begin updating buffers. This is because # if a search term is found twice in one line, the first after buffer # can end up being updated twice with the same information # def updateAfterBuffers(bufQueue, str, length, offset = 0): for i in range(offset, len(bufQueue)): bufQueue[i] = (bufQueue[i] + str)[:length] return bufQueue # Outputs the result, and then increments/returns outCount def outputResult(buf, contextSize, outCount, outLine, outBefore, searchTerm, \ afterBufs): outCount += 1 ab = afterBufs.pop(0).replace('\n', ' ') ab = ab.replace('\r', ' ') # Windows... # Pad ob if necessary ob = outBefore.pop(0).replace('\n', ' ') ob = ob.replace('\r', ' ') if len(ob) < contextSize: ob = (' ' * (contextSize - len(ob) + 1)) + ob # TODO: Make outCount and outLine stay even with eachother (store output? Make guess based on file size?) print "%2i (%2i) %s %s %s" % (outCount, outLine.pop(0), ob, \ searchTerm, ab) return outCount printHelp = True # TODO: Multiple search inputs searchTerm = '' # Word to search for in the document contextSize = 40 # Amount of context to provide on either side of the searchTerm docLoc = '' # Location of the document, '' = stdin argc = len(sys.argv) if argc >= 2 and argc <= 4: searchTerm = sys.argv[1] if argc >= 3: tmp = sys.argv[2] # If int, this is a context size. Otherwise it is the doc location if isInt(tmp): contextSize = tmp else: docLoc = tmp if argc >= 4: docLoc = sys.argv[3] else: # User needs help. print 'Usage: python kwic.py SEARCH_TERM [CONTEXT_SIZE] [DOC_LOCATION]' print '\tSEARCH_TERM\tThe term to search for in the document located at DOC_LOCATION, or in standard in.' print "\tCONTEXT_SIZE\tThe amount of context (# chars) to show before and after SEARCH_TERM. Defaults to %i." % (contextSize) print '\tDOC_LOCATION\tLocation of the text file to search through. If omitted, standard in is used.' sys.exit(0) # Validate input if not os.path.exists(docLoc): print "The path (%s) does not exist." % (docLoc) sys.exit(-1) searchTerm = searchTerm.lower() # Stores the last contextSize characters for each searchTerm encountered. beforeBuf = '' # Stores a snapshot of beforeBuf for output when afterBuf reaches contextSize outBefore = [] # Stores the line each searchTerm was foun on outLine = [] # Stores the next contextSize characters. Each searchTerm encountered pushes # a new value onto this queue (pretend its a queue) afterBufs = [] stream = None searchTermLen = len(searchTerm) lineCount = 0 outCount = 0 # Count the # of search terms outputted try: if docLoc == '': stream = sys.stdin else: stream = open(docLoc) for line in stream: lineCount += 1 termFound = False # TODO: More efficient method of case insensitive search? lowLine = line.lower() loc = lowLine.find(searchTerm) afterBufOffset = 0 while loc != -1: termFound = True # Add text before searchTerm to the buffer, cut to length beforeBuf = concatTrim(beforeBuf, line[:loc], contextSize) updateAfterBuffers(afterBufs, line[:loc], contextSize, \ afterBufOffset) afterBufOffset = len(afterBufs) partition = loc + searchTermLen # TODO: Store the found text so that the case is correct line = line[partition:] lowLine = lowLine[partition:] outBefore.append(beforeBuf) # Snapshot of the before buffer outLine.append(lineCount) afterBufs.append('') updateAfterBuffers(afterBufs, line, contextSize, afterBufOffset) afterBufOffset += 1 # Find next instance (string and searchTerm are already small case) loc = lowLine.find(searchTerm) if not termFound: beforeBuf = concatTrim(beforeBuf, line, contextSize) updateAfterBuffers(afterBufs, line, contextSize) else: # Add what is left of the line to beforeBuf beforeBuf = concatTrim(beforeBuf, line, contextSize) # Check if any buffers reached contextSize, and print out those that did for buf in afterBufs: if len(buf) == contextSize: outCount = outputResult(buf, contextSize, outCount, outLine, \ outBefore, searchTerm, afterBufs) # Clean up before exit (flush all buffers not yet filled) for i in range(0, len(afterBufs)): outCount = outputResult(buf, contextSize, outCount, outLine, \ outBefore, searchTerm, afterBufs) except IOError, (errno, strror): print "IO Error (%s): %s" % (errno, strror) finally: stream.close()