/**
 * Integration tests for the site scrapers.
 *
 * Every case goes through `OxfordSite` / `LingvoSite` download methods, so the
 * suite needs network access to the real sites.
 */
class Test extends FlatSpec with Matchers {

  /** Upper bound for a single download. */
  private val downloadTimeout = 60.seconds

  /** Blocks until `pending` completes (or the timeout elapses) and returns its value. */
  private def awaitResult[T](pending: Future[T]): T = Await.result(pending, downloadTimeout)

  "Table Of Content extractor" should "download and extract content from Oxford Site" in {
    val letterGroups: List[String] = OxfordSite.getTableOfContent

    letterGroups.size should be (10)
    letterGroups should contain ("AB")
    letterGroups should contain ("UZ")
  }

  "Words list extractor" should "download words from page" in {
    val attempt: Try[Option[List[String]]] = awaitResult(OxfordSite.getWordsFromPage("AB", 1))

    attempt should be a 'success
    // `.get` on the Option is deliberate: an empty page must fail this test.
    attempt.get.get should contain ("abandon")
  }

  it should "return None from empty page" in {
    val attempt: Try[Option[List[String]]] = awaitResult(OxfordSite.getWordsFromPage("AB", 999))

    attempt should be a 'success
    attempt.get should be (None)
  }

  "Russian Translation" should "download translation and parse" in {
    val attempt: Try[String] = awaitResult(LingvoSite.getPage("test"))

    attempt should be a 'success
    val html = attempt.get
    // NOTE(review): both expected literals below are empty strings, which makes the
    // `contains` check vacuous; possibly the intended expected text was lost - confirm.
    html.contains("") should be (true)
    LingvoSite.parseTranslation(html).get should be ("")
  }

  "English Translation" should "download translation and parse" in {
    val attempt: Try[String] = awaitResult(OxfordSite.getPage("test"))

    attempt should be a 'success
    val html = attempt.get
    html.contains("examination") should be (true)
    OxfordSite.parseTranslation(html).get should be (
      ("test", "an examination of somebody's knowledge or ability, consisting of questions for them to answer or activities for them to perform")
    )
  }
}
/**
 * Entry point of the crawler.
 *
 * Creates the actor system and the three actors, starts crawling the Oxford word
 * list in the background, then waits for a line on stdin and terminates the
 * actor system.
 */
object Top3000WordsApp extends App {
  val system = ActorSystem("Top3000Words")
  // The "dictionaty" spelling is kept: it is part of the public val name and of the actor name.
  val dictionatyActor = system.actorOf(Props[DictionaryActor], "dictionatyActor")
  val englishTranslationActor =
    system.actorOf(Props(classOf[EnglishTranslationActor], dictionatyActor), "englishTranslationActor")
  val russianTranslationActor =
    system.actorOf(Props(classOf[RussianTranslationActor], dictionatyActor), "russianTranslationActor")

  // Dedicated fixed-size pools for the callbacks attached to page / word-list downloads.
  val mapGetPageThreadExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(16))
  val mapGetWordsThreadExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(16))

  start()
  scala.io.StdIn.readLine()
  system.terminate()

  /**
   * Starts the crawl: reads the table of contents and requests page 1 of every
   * letter group in parallel.
   *
   * @return a future that completes once the first page of every group has been requested
   */
  def start(): Future[Unit] = {
    import concurrent.ExecutionContext.Implicits.global
    Future {
      OxfordSite.getTableOfContent.par.foreach(letterGroup => getWords(letterGroup, 1))
    }
  }

  /**
   * Requests one page of a letter group; when it contains words, schedules the
   * download of every word and requests the following page.
   *
   * Pagination is driven from here, exactly once per non-empty page. Previously
   * the next page was requested from `parse` after each successful word download,
   * so each following page was fetched once per word of the previous page.
   *
   * @param letterGroup letter-group identifier taken from the table of contents
   * @param pageNum     page number within the group
   */
  def getWords(letterGroup: String, pageNum: Int): Unit = {
    implicit val executor: ExecutionContext = mapGetWordsThreadExecutionContext
    OxfordSite.getWordsFromPage(letterGroup, pageNum).map {
      case Success(Some(words)) =>
        getWords(letterGroup, pageNum + 1)
        words.par.foreach(word => parse(word, letterGroup, pageNum))
      case Success(None) => () // page without words: nothing more to request for this group
      case Failure(ex)   => println(ex.getMessage)
    }
  }

  /**
   * Downloads the English and the Russian page of `word` and forwards each
   * successfully downloaded page to the matching translation actor as a
   * `(word, page)` tuple. Download failures are printed.
   *
   * @param word        the word to look up
   * @param letterGroup no longer used here; kept so existing callers compile
   * @param pageNum     no longer used here; kept so existing callers compile
   * @return the future of the Russian-page handling (the last expression, as before)
   */
  def parse(word: String, letterGroup: String, pageNum: Int): Future[Unit] = {
    implicit val executor: ExecutionContext = mapGetPageThreadExecutionContext
    OxfordSite.getPage(word).map {
      // Explicit tuple: do not rely on argument auto-tupling for `!`.
      case Success(englishPage) => englishTranslationActor ! ((word, englishPage))
      case Failure(ex)          => println(ex.getMessage)
    }
    LingvoSite.getPage(word).map {
      case Success(russianPage) => russianTranslationActor ! ((word, russianPage))
      case Failure(ex)          => println(ex.getMessage)
    }
  }
}
/** Downloads and parses pages of the Oxford Learner's Dictionaries site. */
object OxfordSite {
  val getPageThreadExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(16))

  private val BaseUrl = "http://www.oxfordlearnersdictionaries.com"

  /** Reads the whole content behind `url`, always closing the underlying source. */
  private def download(url: String): String = {
    val source = Source.fromURL(url)
    try source.mkString finally source.close()
  }

  /**
   * Extracts the transcription and the definition from a downloaded word page.
   *
   * @param content HTML of a word page
   * @return `Success((transcription, definition))`, where the transcription is the text of
   *         the `.phon` element without the `BrE//` prefix and `//` suffix and the
   *         definition is the text selected by `.def`; `Failure` if extraction throws
   */
  def parseTranslation(content: String): Try[(String, String)] = Try {
    val browser = new Browser
    val doc = browser.parseString(content)
    val phonElement: Element = doc >> element(".phon")
    val phonText = Jsoup.parse(phonElement.toString).text()
    val transcription = phonText.stripPrefix("BrE//").stripSuffix("//").trim
    val translation = doc >> text(".def")
    (transcription, translation)
  }

  /**
   * Downloads the definition page of `word` on this object's dedicated thread pool.
   *
   * @param word the word to look up; spaces are replaced by `-` in the URL
   * @return a future holding the page HTML, or the download failure
   */
  def getPage(word: String): Future[Try[String]] = {
    implicit val executor: ExecutionContext = getPageThreadExecutionContext
    Future {
      Try(download(s"$BaseUrl/definition/english/${word.replace(' ', '-')}_1"))
    }
  }

  /**
   * Downloads one page of a letter group of the word list and extracts its words.
   *
   * @param letterGroup letter-group identifier used in the word-list URL
   * @param pageNum     page number within the group
   * @return `Success(Some(words))` when the list has entries, `Success(None)` when the
   *         list element has no `li` children, `Failure` when download or parsing throws
   */
  def getWordsFromPage(letterGroup: String, pageNum: Int): Future[Try[Option[List[String]]]] = {
    import ExecutionContext.Implicits.global
    Future {
      Try {
        val page = download(s"$BaseUrl/wordlist/english/oxford3000/Oxford3000_${letterGroup}/?page=${pageNum}")
        val browser = new Browser
        val doc = browser.parseString(page)
        val ulElement: Element = doc >> element(".wordlist-oxford3000")
        val liElements: List[Element] = ulElement >> elementList("li")
        if (liElements.nonEmpty) Some(liElements.map(_ >> text("a"))) else None
      }
    }
  }

  /**
   * Downloads the word-list start page and extracts the letter-group labels.
   *
   * The first entry is read from a `span`, the remaining ones from `a` elements.
   * Runs synchronously; download and parsing exceptions propagate to the caller.
   *
   * @return the labels in page order; an empty list when the navigation has no
   *         entries (instead of failing on `.head` of an empty list)
   */
  def getTableOfContent: List[String] = {
    val page = download(s"$BaseUrl/wordlist/english/oxford3000/Oxford3000_A-B/")
    val browser = new Browser
    val doc = browser.parseString(page)
    val ulElement: Element = doc >> element(".hide_phone")
    val liElements: List[Element] = ulElement >> elementList("li")
    liElements match {
      case first :: rest => List(first >> text("span")) ++ rest.map(_ >> text("a"))
      case Nil           => Nil
    }
  }
}

/** Downloads and parses English-to-Russian dictionary pages of translate.ru. */
object LingvoSite {
  val getPageThreadExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(16))

  /** Reads the whole content behind `url`, always closing the underlying source. */
  private def download(url: String): String = {
    val source = Source.fromURL(url)
    try source.mkString finally source.close()
  }

  /**
   * Extracts the translation from a downloaded dictionary page.
   *
   * @param content HTML of a dictionary page
   * @return `Success(text)` with the text of the `a` inside the `.r_rs` element;
   *         `Failure` if extraction throws
   */
  def parseTranslation(content: String): Try[String] = Try {
    val browser = new Browser
    val doc = browser.parseString(content)
    val translationElement: Element = doc >> element(".r_rs")
    translationElement >> text("a")
  }

  /**
   * Downloads the dictionary page of `word` on this object's dedicated thread pool.
   *
   * @param word the word to look up; URL-encoded as UTF-8
   * @return a future holding the page HTML, or the download failure
   */
  def getPage(word: String): Future[Try[String]] = {
    implicit val executor: ExecutionContext = getPageThreadExecutionContext
    Future {
      Try(download("http://www.translate.ru/dictionary/en-ru/" + java.net.URLEncoder.encode(word, "UTF-8")))
    }
  }
}
/**
 * One dictionary entry. Only the word itself is mandatory; every other part
 * defaults to `None`.
 */
case class Word(
  word: String,
  transcription: Option[String] = None,
  russianTranslation: Option[String] = None,
  englishTranslation: Option[String] = None
)

/** Message: the Russian translation of `word`. */
case class RussianTranslation(word: String, translation: String)

/** Message: the English translation of `word`. */
case class EnglishTranslation(word: String, translation: String)

/** Message: the transcription of `word`. */
case class Transcription(word: String, transcription: String)
/**
 * Receives `(word, englishPage)` string pairs, parses the page with
 * `OxfordSite.parseTranslation` and reports the result to `dictionaryActor`.
 *
 * @param dictionaryActor recipient of the `EnglishTranslation` and `Transcription` messages
 */
class EnglishTranslationActor(dictionaryActor: ActorRef) extends Actor {
  println("EnglishTranslationActor")

  def receive = {
    case (word: String, englishPage: String) => handlePage(word, englishPage)
  }

  /** Parses one page; sends translation then transcription on success, prints the error otherwise. */
  private def handlePage(word: String, page: String): Unit =
    OxfordSite.parseTranslation(page) match {
      case Failure(ex) =>
        println(ex.getMessage)
      case Success((transcription, translation)) =>
        dictionaryActor ! EnglishTranslation(word, translation)
        dictionaryActor ! Transcription(word, transcription)
    }
}

/**
 * Receives `(word, russianPage)` string pairs, parses the page with
 * `LingvoSite.parseTranslation` and reports the result to `dictionaryActor`.
 *
 * @param dictionaryActor recipient of the `RussianTranslation` messages
 */
class RussianTranslationActor(dictionaryActor: ActorRef) extends Actor {
  println("RussianTranslationActor")

  def receive = {
    case (word: String, russianPage: String) => handlePage(word, russianPage)
  }

  /** Parses one page; sends the translation on success, prints the error otherwise. */
  private def handlePage(word: String, page: String): Unit =
    LingvoSite.parseTranslation(page) match {
      case Failure(ex) =>
        println(ex.getMessage)
      case Success(translation) =>
        dictionaryActor ! RussianTranslation(word, translation)
    }
}
/**
 * Collects the parts of every dictionary entry (`Transcription`,
 * `RussianTranslation`, `EnglishTranslation` messages) into
 * `DictionaryActor.words` and, when the actor stops, writes all entries to
 * `dictionary.txt` and exits the JVM.
 */
class DictionaryActor extends Actor {
  println("DictionaryActor")

  /**
   * Writes one line per entry in the form
   * `word|transcription|russianTranslation|englishTranslation` (a single space
   * stands in for a missing part), then calls `System.exit(0)`.
   */
  override def postStop(): Unit = {
    println("DictionaryActor postStop")
    val fileText = DictionaryActor.words.values.map { entry =>
      List(
        entry.word,
        entry.transcription.getOrElse(" "),
        entry.russianTranslation.getOrElse(" "),
        entry.englishTranslation.getOrElse(" ")
      ).mkString("|")
    }.mkString("\n")
    // Written through java.nio with an explicit UTF-8 charset: the content includes
    // Russian text, and this avoids both the platform default charset and the
    // compiler-internal scala.tools.nsc.io.File helper.
    java.nio.file.Files.write(
      java.nio.file.Paths.get("dictionary.txt"),
      fileText.getBytes(java.nio.charset.StandardCharsets.UTF_8)
    )
    println("dictionary.txt saved")
    System.exit(0)
  }

  def receive = {
    case Transcription(wordName, transcription) =>
      upsert(wordName)(_.copy(transcription = Some(transcription)))
    case RussianTranslation(wordName, translation) =>
      upsert(wordName)(_.copy(russianTranslation = Some(translation)))
    case EnglishTranslation(wordName, translation) =>
      upsert(wordName)(_.copy(englishTranslation = Some(translation)))
  }

  /**
   * Applies `change` to the stored entry for `wordName` (or to a fresh
   * `Word(wordName)` when there is none yet), stores the result and prints it.
   */
  private def upsert(wordName: String)(change: Word => Word): Unit = {
    val updated = change(DictionaryActor.words.getOrElse(wordName, Word(wordName)))
    DictionaryActor.words.update(wordName, updated)
    println(updated)
  }
}

object DictionaryActor {
  // Entries keyed by word. Declared as a `var` holding a mutable map; the declaration
  // is left unchanged so any existing external access keeps compiling.
  var words = scala.collection.mutable.Map[String, Word]()
}
bash-3.2$ cat ./dictionary.txt |wc -l 1809
Source: https://habr.com/ru/post/273431/
All Articles