Created
May 26, 2011 06:37
-
-
Save Swind/992655 to your computer and use it in GitHub Desktop.
[Scala][Web] 伊莉文章的Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package rainy.maid.server.seeker | |
import java.util.ArrayList | |
import rainy.maid.server.domain.EneyPost | |
import scala.xml.NodeSeq | |
import scala.xml.{ XML, Node } | |
import de.hars.scalaxml._ | |
import java.text.SimpleDateFormat | |
/**
 * Pages through a forum thread listing and extracts the regular (non-pinned)
 * threads found on each page.
 *
 * @param root prefix prepended to every link taken from the page (thread links
 *             and the "next page" link)
 * @param url  address of the first listing page to fetch
 */
class eneySeeker(root: String, url: String) {

  /** Address fetched by the next call to `next()`; rewritten at the end of every call. */
  var nextPage: String = url

  /** Matches the whole `id` attribute of a regular thread row and captures its 6-7 digit number. */
  val normalThreadR = "normalthread_([0-9]{6,7})".r

  /**
   * Parses the date text returned by `getLastModify`.
   * NOTE(review): `SimpleDateFormat` is documented as not thread-safe; this instance
   * is shared by every call on the same seeker.
   */
  val dateFormat = new SimpleDateFormat("yyyy-MM-dd")

  /**
   * Downloads the page at `nextPage` (decoded as Big5), collects one `EneyPost`
   * per regular thread row, and then points `nextPage` at the following page.
   *
   * @return the posts of the fetched page, each carrying its id, title, link and
   *         last-modified date
   */
  def next(): ArrayList[EneyPost] = {
    // Read the whole page, then release the underlying stream even if reading fails.
    val source = scala.io.Source.fromURL(nextPage, "big5")
    val html = try source.mkString finally source.close()

    val result = new TagSoupFactoryAdapter().loadString(html)
    val tbodys = result \\ "tbody"
    val subjects = new ArrayList[EneyPost]

    // Only rows whose id contains "normalthread" are kept: those are the threads
    // that are not pinned to the top of the listing.
    for (tbody <- tbodys if (tbody \ "@id").text.contains("normalthread")) {
      subjects.add(getSubject(tbody))
    }

    // NOTE(review): when the page has no "next" link this presumably appends an
    // empty string, leaving nextPage equal to root - confirm against callers.
    nextPage = root + getNextPage(result)
    subjects
  }

  /**
   * Builds an `EneyPost` from one thread row.
   *
   * Throws `NoSuchElementException` when the row contains no `span`, and
   * `MatchError` when the row's `id` attribute does not fully match `normalThreadR`.
   *
   * @param tbody the `tbody` element of a regular thread row
   * @return a post populated with id, title, URL and last-modified date
   */
  def getSubject(tbody: Node): EneyPost = {
    // The first span below the row holds the thread's title and hyperlink.
    val titleEl = (tbody \\ "span").head
    // The number captured from the row id is the thread's id on the forum.
    val normalThreadR(id) = (tbody \ "@id").text

    val newPost = new EneyPost
    newPost.setId(id.toLong)
    newPost.setTitleName(titleEl.text)
    // Use `.text`, not string concatenation with the NodeSeq: a NodeSeq's toString
    // re-escapes XML entities, which would turn `&` in a query string into `&amp;`.
    newPost.setURL(root + (titleEl \ "a" \ "@href").text)
    newPost.setLastModifyTime(dateFormat.parse(getLastModify(tbody)))
    newPost
  }

  /**
   * Extracts the last-modified date text of a thread row.
   *
   * @param tbody the `tbody` element of a thread row
   * @return the text of the `em` elements below the `td` cells whose class is `author`
   */
  def getLastModify(tbody: Node): String = {
    (RichNodeSeq(tbody) \\ "td[@class==author]" \\ "em").text
  }

  /**
   * Extracts the link to the following listing page.
   *
   * @param body the parsed page
   * @return the `href` of the `a` elements with class `next` found below the
   *         `div` elements with class `pages`
   */
  def getNextPage(body: Node): String = {
    (RichNodeSeq(body) \\ "div[@class==pages]" \\ "a[@class==next]" \\ "@href").text
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package test.rainy.maid.server.maidwork | |
import org.junit.After | |
import org.junit.Before | |
import org.junit.Test | |
import com.google.appengine.tools.development.testing.LocalDatastoreServiceTestConfig | |
import com.google.appengine.tools.development.testing.LocalServiceTestHelper | |
import org.scalatest.junit.JUnitSuite | |
import scala.collection.mutable.ListBuffer | |
import rainy.maid.server.domain._ | |
import rainy.maid.server.maidwork._ | |
import scala.collection.JavaConversions._ | |
/**
 * JUnit-style suite that runs the `EneySearchNewPost` task with the App Engine
 * local datastore test environment set up around each test.
 */
class testEneySearchNewPost extends JUnitSuite {

  /** Test helper configured with the local datastore service. */
  val helper: LocalServiceTestHelper =
    new LocalServiceTestHelper(new LocalDatastoreServiceTestConfig())

  // The result types below are declared as Unit explicitly: JUnit 4 requires
  // @Before/@After/@Test methods to be void, and an inferred result type would
  // follow whatever the last expression happens to return.

  /** Prepares the local service environment before each test. */
  @Before
  def setUp(): Unit = {
    helper.setUp()
  }

  /** Releases the local service environment after each test. */
  @After
  def tearDown(): Unit = {
    helper.tearDown()
  }

  /**
   * Runs the search task from a work log pointing at the forum listing and
   * prints the id and title of every post returned by `findPosts(false)`.
   * The test makes no assertions; it only checks that the run completes.
   * NOTE(review): `doWork` presumably fetches the configured URL over the
   * network - confirm before running in an offline build.
   */
  @Test
  def testSearchNewPost(): Unit = {
    val workLog = new EneyWorkLog
    workLog.setRootPath("http://www.eyny.com/")
    workLog.setFullPath("http://www.eyny.com/forumdisplay.php?fid=22&filter=&orderby=dateline")
    workLog.setLastPostID(5506759)

    val toDoList = new MaidToDoList with EneySearchNewPost
    toDoList.doWork(workLog)

    val posts = EneyPostManager.findPosts(false)
    // Explicit tuple keeps the original "(id,title)" output without relying on auto-tupling.
    posts.foreach(post => println((post.getId, post.getTitleName)))
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment