A little skinny full-text search tool by full inverted index

19 Mar 2017

Small file search engine using inverted index, powered by Slick &Akka&Scala-Swing Facing to a task that

Build a basic search tool which allows the user to select a folder. Then for each query (list of terms), the search tool returns all the files under the input folder that contains all these terms. and Manipulating a synonym database from init file

Output of full inverted index map list as below

entry of inverted index map format like word -> Map(doc_id -> (occurrences, index_list)

For the simple test 4 docs with the ID as the index, I just makes it simple as possible doc1.txt

new home sales top forecasts ascend ascend ascend

doc2.txt

home sales rise in july sales sales home sales rise in july sales sales

doc3.txt

increase in home sales in july

doc4.txt

july new home sales rise

  • forecast -> Map(1 -> (1,ArrayBuffer(19)))
  • sale -> Map(2 -> (6,ArrayBuffer(5, 24, 30, 41, 60, 66)), 4 -> (1,ArrayBuffer(15)), 1 -> (1,ArrayBuffer(9)), 3 -> (1,ArrayBuffer(18)))
  • rise -> Map(2 -> (2,ArrayBuffer(11, 47)), 4 -> (1,ArrayBuffer(21)))
  • top -> Map(1 -> (1,ArrayBuffer(15)))
  • in -> Map(3 -> (2,ArrayBuffer(1, 10)), 2 -> (2,ArrayBuffer(16, 52)))
  • juli -> Map(2 -> (2,ArrayBuffer(19, 55)), 4 -> (1,ArrayBuffer(1)), 3 -> (1,ArrayBuffer(27)))
  • home -> Map(2 -> (2,ArrayBuffer(0, 36)), 4 -> (1,ArrayBuffer(10)), 1 -> (1,ArrayBuffer(4)), 3 -> (1,ArrayBuffer(13)))
  • new -> Map(4 -> (1,ArrayBuffer(6)), 1 -> (1,ArrayBuffer(0)))
  • increas -> Map(3 -> (1,ArrayBuffer(1)))
  • ascend -> Map(1 -> (3,ArrayBuffer(29, 36, 43)))

Core code to build full inverted index as below

object InvertedIndexHelper extends App {

  var filePath2ID = Map[String,String]()
  var ID2FilePath = Map[String,String]()
  
  type word_occ_indexes = (String, Int, ArrayBuffer[Int])
  type invertedIndexMap = Map[String, Map[String, (Int, ArrayBuffer[Int])]]

  def buildupInvertedMap(files: Seq[String], f: (word_occ_indexes) => (word_occ_indexes)) = {
    // Map of (filePath -> one line of content)
    var fileID = 0
    val fileToContentMap = files.map { filePath =>
      val sourceFile = Source.fromFile(filePath)
      //reformat file content to one line, which makes easier to count word indexes
      val oneLine = sourceFile.getLines().mkString(" ") 
      sourceFile.close()
      //generate filePath -> id
      fileID+=1
      filePath2ID += (filePath -> fileID.toString)
      (fileID.toString -> oneLine)
    }
    //invert the map
    ID2FilePath = filePath2ID.map(_.swap)

    val unsortedResult = fileToContentMap.map {
      //convert to Iterable of (file -> (word,frequency, array of index))
      case (file, content) => (file -> lineToTuple(content).map(f(_)))
    }.foldLeft(Map.empty[String, Map[String, (Int, ArrayBuffer[Int])]]) {
      (map, file_content) => {
        file_content._2.foreach { word_freq_indexes =>
          //map add an entry which is (word -> entry of (filePath -> (frequency, Array of index)))
          val indexMapValue = 
            (map.getOrElse(word_freq_indexes._1, Map()) += 
              (file_content._1 -> (word_freq_indexes._2, word_freq_indexes._3)))
          map += (word_freq_indexes._1 -> indexMapValue)
        }
        map
      }
    }

    unsortedResult.map {case (word, listMap) =>
        //sort by word frequency
        val newListMap = ListMap(listMap.toSeq.sortBy(_._2._1): _*)
        (word -> newListMap)
      }

  }

  private def lineToTuple(line: String): Iterable[(String, Int, ArrayBuffer[Int])] = {
    val ss = Map.empty[String, (String, Int, ArrayBuffer[Int])]
    line.trim.split("\\W+").foreach { word =>
      ss(word) = multiIndexOf(line, word, ss.getOrElse(word, (word, 0, ArrayBuffer[Int]())))
    }
    ss.values
  }

  private def multiIndexOf(line: String, word: String, acc: (String, Int, ArrayBuffer[Int])) = {
    if (acc._1 == word) {
      val nextIndex = if (acc._3.isEmpty) line.indexOf(word) else line.indexOf(word, acc._3.last + word.size)
      val freq = acc._2 + 1
      acc._3.append(nextIndex)
      (word, freq, acc._3)
    } else {
      acc
    }
  }


  val prefix = "fileAbsolutePath"
  val input = List(prefix + "doc1.txt", prefix + "doc2.txt", prefix + "doc3.txt", prefix + "doc4.txt")
  println(buildupInvertedMap(input, Stemming.doStem _).mkString("\n"))

}

Unit test covered by

scalaTest & akka-testkit

class SearchActorSpec extends TestKit(ActorSystem(
  "SearchActorSpec",
  ConfigFactory.parseString(SearchActorSpec.config)))
  with DefaultTimeout with ImplicitSender
  with WordSpecLike with Matchers with BeforeAndAfterAll{

  val searchActor = system.actorOf(Props[SearchActor],name="searchActor")
  val searchFile = new File(getClass.getResource("/doc4.txt").getFile)

  val searchActorRef = TestActorRef(new SearchActor)

  override def afterAll(): Unit = shutdown()

  "SearchActor" should {
    "respond to the SearchMessage " in {
      within(500 millis) {
        searchActor ! SearchMessage(searchFile.getParentFile,"forecast")
        expectMsgPF(500 millis){
          case msg @ (a:List[String],b:String) =>
            a should have size 1
            b should not be empty
        }
      }
    }

    "throw exception" in {
      intercept[Exception](searchActorRef.receive("error"))
    }
  }
}


object SearchActorSpec {
  val config =
    """
      |akka {
      |   loglevel = "WARNING"
      |}
    """.stripMargin
}

Stemming the words by

The Porter Stemming Algorithm

Reference to inverted index

elastic - basic concepts URL A first take at building an inverted index Full inverted index