Skip to content

Instantly share code, notes, and snippets.

@Pramod-Devireddy
Last active October 18, 2022 23:26
Show Gist options
  • Save Pramod-Devireddy/7691f867d5dacf1a205b4f4e3096ed75 to your computer and use it in GitHub Desktop.
Save Pramod-Devireddy/7691f867d5dacf1a205b4f4e3096ed75 to your computer and use it in GitHub Desktop.
TollySearch

TollySearch

Lists all films of a given language and year (tollysearch.herokuapp.com/{lang}/{year}) by scraping the corresponding Wikipedia list-of-films page.

Heroku App

Dependencies

  • gocolly
  • gorilla mux

All film information is sourced from Wikipedia.

package main
import (
"fmt"
"log"
"net/http"
"os"
"strings"
"tollysearch/colly"
"tollysearch/mux"
)
// response is package-level scratch text for the film-list output.
// NOTE(review): package-level mutable state is shared by every request served
// by this process; concurrent writers would race on it. Prefer a buffer that
// is local to the handler.
var response string
// hello serves the root route: it writes a fixed greeting followed by a
// newline to the response. The request itself is not inspected.
func hello(w http.ResponseWriter, r *http.Request) {
	const greeting = "Hello Guys!"
	fmt.Fprintln(w, greeting)
}
// determineListenAddress builds the address the HTTP server listens on from
// the PORT environment variable.
//
// It returns ":<port>" when PORT is set, and an error when PORT is unset or
// empty. The value is used as-is; it is not validated as a port number.
func determineListenAddress() (string, error) {
	// Honor the environment: a hard-coded override here would make the
	// emptiness check below unreachable and ignore the platform-assigned port.
	port := os.Getenv("PORT")
	if port == "" {
		return "", fmt.Errorf("$PORT not set")
	}
	return ":" + port, nil
}
// main resolves the listen address, registers the routes, and serves HTTP
// until the server fails.
//
// Routes:
//   - "/"              -> hello
//   - "/{lang}/{year}" -> updateMoviesList
//
// Both failure paths (no usable listen address, server error) terminate the
// process through log.Fatal so they are reported the same way.
func main() {
	addr, err := determineListenAddress()
	if err != nil {
		log.Fatal(err)
	}

	router := mux.NewRouter()
	router.HandleFunc("/", hello)
	router.HandleFunc("/{lang}/{year}", updateMoviesList)

	// ListenAndServe blocks and always returns a non-nil error, so reaching
	// log.Fatal here means the server could not start or has stopped.
	log.Fatal(http.ListenAndServe(addr, router))
}
// updateMoviesList handles "/{lang}/{year}". It visits the Wikipedia page
// List_of_{lang}_films_of_{year}, picks one title cell from each row of every
// "table.wikitable" that has a header cell reading "cast" (case-insensitive),
// and writes the titles to w, one per line, followed by a final newline.
//
// Titles are collected in a buffer local to this call, so concurrent requests
// cannot overwrite or interleave each other's output. If the page cannot be
// visited, the handler logs the error and replies 502 Bad Gateway instead of
// returning an empty 200 response.
func updateMoviesList(w http.ResponseWriter, r *http.Request) {
	vars := mux.Vars(r)
	lang, year := vars["lang"], vars["year"]

	var titles strings.Builder

	c := colly.NewCollector()
	c.OnRequest(func(req *colly.Request) {
		fmt.Println("Visiting", req.URL)
	})
	c.OnHTML("table.wikitable", func(e *colly.HTMLElement) {
		headersLength := len(e.ChildTexts("th"))
		// The row scan below runs once per header cell whose text is "cast".
		e.ForEach("th", func(_ int, el *colly.HTMLElement) {
			if strings.TrimSpace(strings.ToLower(el.DOM.Text())) != "cast" {
				return
			}
			// Normalize the header count before comparing it with per-row
			// cell counts. NOTE(review): the 8 and <=5 thresholds look tuned
			// to specific Wikipedia table layouts — confirm against real pages.
			if headersLength == 8 {
				headersLength--
			} else if headersLength <= 5 {
				headersLength++
			}
			e.ForEach("tr", func(_ int, row *colly.HTMLElement) {
				if title := rowTitle(row.ChildTexts("td"), headersLength); title != "" {
					titles.WriteString(title)
					titles.WriteByte('\n')
				}
			})
		})
	})

	pageURL := "https://en.wikipedia.org/wiki/List_of_" + lang + "_films_of_" + year
	if err := c.Visit(pageURL); err != nil {
		log.Printf("visiting %q: %v", pageURL, err)
		http.Error(w, "could not fetch film list from Wikipedia", http.StatusBadGateway)
		return
	}
	fmt.Fprintln(w, titles.String())
}

// rowTitle returns the title cell of one table row given the texts of its
// <td> cells and the (normalized) header count: index 2 when the row has one
// cell more than the header count, index 1 when it has exactly that many, and
// index 0 when it has one fewer. Any other row shape yields "".
// NOTE(review): presumably the extra leading cells are date columns that span
// several rows — confirm against the scraped tables.
func rowTitle(cells []string, headersLength int) string {
	idx := -1
	switch len(cells) {
	case headersLength + 1:
		idx = 2
	case headersLength:
		idx = 1
	case headersLength - 1:
		idx = 0
	}
	// Defensive bound check so an unexpected header count can never panic.
	if idx < 0 || idx >= len(cells) {
		return ""
	}
	return cells[idx]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment