Skip to content

Instantly share code, notes, and snippets.

@Swind
Created May 26, 2011 06:37
Show Gist options
  • Save Swind/992655 to your computer and use it in GitHub Desktop.
Save Swind/992655 to your computer and use it in GitHub Desktop.
[Scala][Web] 伊莉文章的Parser
package rainy.maid.server.seeker
import java.util.ArrayList
import rainy.maid.server.domain.EneyPost
import scala.xml.NodeSeq
import scala.xml.{ XML, Node }
import de.hars.scalaxml._
import java.text.SimpleDateFormat
/**
 * Walks a forum thread-list page by page and converts every ordinary
 * (non-sticky) thread row into an `EneyPost`.
 *
 * @param root prefix prepended to every link read from a page
 * @param url  address of the first listing page to download
 */
class eneySeeker(root: String, url: String) {
  /** Address that the next call to `next()` will download. */
  var nextPage = url

  /** Full-match pattern for the `id` attribute of an ordinary thread row; group 1 is the thread ID. */
  val normalThreadR = "normalthread_([0-9]{6,7})".r

  /** Format used to parse the date text found in the author cell. */
  val dateFormat = new SimpleDateFormat("yyyy-MM-dd")

  /**
   * Downloads `nextPage` (decoded as big5), parses it and collects one
   * `EneyPost` (ID, title, link, last-modified date) per ordinary thread row.
   * Afterwards `nextPage` is advanced to `root` + the "next" link of the page.
   *
   * NOTE(review): when the page has no "next" link, `getNextPage` yields an
   * empty string and `nextPage` becomes just `root` — callers have to decide
   * themselves when to stop paging.
   *
   * @return the posts found on the page, in document order
   */
  def next(): ArrayList[EneyPost] = {
    // Read the whole page first and always release the underlying stream.
    val source = scala.io.Source.fromURL(nextPage, "big5")
    val html = try source.mkString finally source.close()
    val result = (new TagSoupFactoryAdapter).loadString(html)

    val subjects = new ArrayList[EneyPost]
    // Only rows whose id is a "normalthread" id are taken; those are the non-sticky posts.
    for (tbody <- result \\ "tbody" if isNormalThread(tbody)) {
      subjects.add(getSubject(tbody))
    }

    nextPage = root + getNextPage(result)
    subjects
  }

  /**
   * True when the row's `id` attribute matches `normalThreadR` completely.
   * Uses the same pattern as `getSubject`, so every accepted row can have its
   * ID extracted; rows with a different id shape are skipped instead of
   * aborting the whole page with a `MatchError`.
   */
  private def isNormalThread(tbody: Node): Boolean =
    normalThreadR.pattern.matcher((tbody \ "@id").text).matches()

  /**
   * Builds an `EneyPost` from one thread row.
   *
   * @param tbody a `tbody` element whose `id` attribute matches `normalThreadR`
   * @return a post carrying the thread ID, title, link and last-modified date
   * @throws NoSuchElementException if the row contains no `span`
   * @throws MatchError if the row's `id` does not match `normalThreadR`
   * @throws java.text.ParseException if the date text is not `yyyy-MM-dd`
   */
  def getSubject(tbody: Node): EneyPost = {
    // The first span below the row holds the post's title and its hyperlink.
    val titleEl = (tbody \\ "span").headOption.getOrElse(
      throw new NoSuchElementException("thread row contains no <span> title element"))
    // The captured number is the post's ID in the forum.
    val normalThreadR(id) = (tbody \ "@id").text

    val newPost = new EneyPost
    newPost.setId(id.toLong)
    newPost.setTitleName(titleEl.text)
    // `.text`, as in getNextPage, gives the plain attribute value rather than
    // the node sequence's XML rendering.
    newPost.setURL(root + (titleEl \ "a" \ "@href").text)
    newPost.setLastModifyTime(dateFormat.parse(getLastModify(tbody)))
    newPost
  }

  /**
   * Reads the date text of a thread row.
   *
   * @param tbody the thread row
   * @return text of the `em` elements below the `td` whose class is `author`
   */
  def getLastModify(tbody: Node): String = {
    // Select the td below the row whose class attribute is "author".
    (RichNodeSeq(tbody) \\ "td[@class==author]" \\ "em").text
  }

  /**
   * Finds the link to the following listing page.
   *
   * @param body parsed page
   * @return `href` text of the `a` with class `next` inside the `div` with class `pages`
   */
  def getNextPage(body: Node): String = {
    // Of all hyperlinks below the div with class "pages", only the one with class "next" matters.
    (RichNodeSeq(body) \\ "div[@class==pages]" \\ "a[@class==next]" \\ "@href").text
  }
}
package test.rainy.maid.server.maidwork
import org.junit.After
import org.junit.Before
import org.junit.Test
import com.google.appengine.tools.development.testing.LocalDatastoreServiceTestConfig
import com.google.appengine.tools.development.testing.LocalServiceTestHelper
import org.scalatest.junit.JUnitSuite
import scala.collection.mutable.ListBuffer
import rainy.maid.server.domain._
import rainy.maid.server.maidwork._
import scala.collection.JavaConversions._
/**
 * JUnit suite that runs the `EneySearchNewPost` job once against a local
 * App Engine datastore stub and prints the posts found afterwards.
 */
class testEneySearchNewPost extends JUnitSuite {
  /** Local App Engine service stub, configured with a datastore only. */
  val helper: LocalServiceTestHelper = new LocalServiceTestHelper(new LocalDatastoreServiceTestConfig())

  // The lifecycle and test methods declare `Unit` explicitly: JUnit 4 only
  // accepts void methods, and an inferred result type would follow whatever
  // the last expression returns (`helper.setUp()` returns the helper itself).
  @Before
  def setUp(): Unit = {
    helper.setUp()
  }

  @After
  def tearDown(): Unit = {
    helper.tearDown()
  }

  /**
   * Configures a work log with the forum's root and listing URLs and a last
   * seen post ID, runs the job, then prints `(id, title)` for each post
   * returned by `EneyPostManager.findPosts(false)`.
   *
   * NOTE(review): there are no assertions — the test only fails when an
   * exception is thrown.
   */
  @Test
  def testSearchNewPost(): Unit = {
    val workLog = new EneyWorkLog
    workLog.setRootPath("http://www.eyny.com/")
    workLog.setFullPath("http://www.eyny.com/forumdisplay.php?fid=22&filter=&orderby=dateline")
    workLog.setLastPostID(5506759)

    val toDoList = new MaidToDoList with EneySearchNewPost
    toDoList.doWork(workLog)

    val posts = EneyPostManager.findPosts(false)
    // Explicit tuple keeps the printed "(id,title)" form without relying on argument auto-tupling.
    posts.foreach(post => println((post.getId, post.getTitleName)))
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment