List of all films of any language and year (tollysearch.herokuapp.com/{lang}/{year}). Scrapes Wikipedia to get the list of films for a given language and year. Built with:
- gocolly
- gorilla mux
All information is sourced from Wikipedia.
List of all films of any language and year (tollysearch.herokuapp.com/{lang}/{year}). Scrapes Wikipedia to get the list of films for a given language and year.
All information is sourced from Wikipedia.
package main | |
import ( | |
"fmt" | |
"log" | |
"net/http" | |
"os" | |
"strings" | |
"tollysearch/colly" | |
"tollysearch/mux" | |
) | |
// response is a package-level scratch string for scraped film titles.
//
// Deprecated: package-level state is shared by every concurrent request, so
// handlers should build their output in a per-request buffer instead.
var response string
// hello is the handler for the root route. It ignores the request and writes
// a fixed greeting followed by a newline.
func hello(w http.ResponseWriter, _ *http.Request) {
	fmt.Fprintln(w, "Hello Guys!")
}
// determineListenAddress builds the address the HTTP server should bind to
// from the PORT environment variable.
//
// It returns ":<PORT>" (all interfaces), or an error when PORT is unset or
// empty. The value is not validated beyond the emptiness check.
func determineListenAddress() (string, error) {
	// The port must come from the environment; a hard-coded override here
	// would make the variable (and the error path below) meaningless.
	port := os.Getenv("PORT")
	if port == "" {
		return "", fmt.Errorf("$PORT not set")
	}
	return ":" + port, nil
}
func main() { | |
addr, err := determineListenAddress() | |
if err != nil { | |
log.Fatal(err) | |
} | |
router := mux.NewRouter() | |
router.HandleFunc("/", hello) | |
router.HandleFunc("/{lang}/{year}", updateMoviesList) | |
if err := http.ListenAndServe(addr, router); err != nil { | |
panic(err) | |
} | |
} | |
func updateMoviesList(w http.ResponseWriter, r *http.Request) { | |
var lang, year string | |
lang = mux.Vars(r)["lang"] | |
year = mux.Vars(r)["year"] | |
response = "" | |
c := colly.NewCollector() | |
c.OnRequest(func(r *colly.Request) { | |
fmt.Println("Visiting", r.URL) | |
}) | |
c.OnHTML("table.wikitable", func(e *colly.HTMLElement) { | |
headers := e.ChildTexts("th") | |
headersLength := len(headers) | |
e.ForEach("th", func(_ int, el *colly.HTMLElement) { | |
h := el.DOM.Text() | |
if strings.TrimSpace(strings.ToLower(h)) == "cast" { | |
if headersLength == 8 { | |
headersLength = headersLength - 1 | |
} else if headersLength <= 5 { | |
headersLength = headersLength + 1 | |
} | |
e.ForEach("tr", func(_ int, ele *colly.HTMLElement) { | |
elements := ele.ChildTexts("td") | |
var title string | |
elemsLength := len(elements) | |
if elemsLength == headersLength+1 { | |
title = elements[2] | |
} else if elemsLength == headersLength { | |
title = elements[1] | |
} else if elemsLength == headersLength-1 { | |
title = elements[0] | |
} | |
if title != "" { | |
response = response + title + "\n" | |
} | |
}) | |
} | |
}) | |
}) | |
c.Visit("https://en.wikipedia.org/wiki/List_of_" + lang + "_films_of_" + year) | |
fmt.Fprintln(w, response) | |
} |