Last active
August 24, 2023 10:56
-
-
Save mostafa6765/eb33bb701c732ac29bcbaf8bc369236a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
attrs==23.1.0
Automat==22.10.0
certifi==2023.5.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.6
constantly==15.1.0
cryptography==41.0.1
cssselect==1.2.0
docker==6.1.3
fake-useragent==1.1.3
Faker==18.11.2
filelock==3.12.1
greenlet==2.0.2
hyperlink==21.0.0
idna==3.4
incremental==22.10.0
itemadapter==0.8.0
itemloaders==1.1.0
jmespath==1.0.1
lxml==4.9.2
packaging==23.1
parsel==1.8.1
playwright==1.34.0
Protego==0.2.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
PyDispatcher==2.0.7
pyee==9.0.4
pyOpenSSL==23.2.0
python-dateutil==2.8.2
PyYAML==6.0.1
queuelib==1.6.2
requests==2.31.0
requests-file==1.5.1
retrying==1.3.4
scrapinghub==2.4.0
Scrapy==2.9.0
scrapy-fake-useragent==1.4.4
scrapy-playwright==0.0.26
scrapy-zyte-smartproxy==2.2.0
scrapyrt==0.13.0
service-identity==21.1.0
shub==2.14.5
six==1.16.0
tldextract==3.4.4
toml==0.10.2
tqdm==4.55.1
Twisted==22.10.0
typing_extensions==4.6.3
urllib3==2.0.3
w3lib==2.1.1
websocket-client==1.6.1
zope.interface==6.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapy settings for palscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import os

BOT_NAME = "palscraper"

SPIDER_MODULES = ["palscraper.spiders"]
NEWSPIDER_MODULE = "palscraper.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "palscraper (+http://www.yourdomain.com)"

# Whether to obey robots.txt rules (disabled for this project)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# NOTE: CONCURRENT_REQUESTS is set explicitly near the end of this file.

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# NOTE: CONCURRENT_REQUESTS_PER_DOMAIN is set explicitly near the end of
# this file.
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are kept enabled (this is also the default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "palscraper.middlewares.PalscraperSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# NOTE: DOWNLOADER_MIDDLEWARES is defined further down (Zyte Smart Proxy).
# Add the project middleware to that dict if it is ever needed:
#    "palscraper.middlewares.PalscraperDownloaderMiddleware": 543,

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "palscraper.pipelines.PalscraperPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AUTOTHROTTLE_ENABLED is set explicitly near the end of this file.
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# scrapy-playwright: render every http/https request in a headless browser.
PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": True}
PLAYWRIGHT_BROWSER_TYPE = "webkit"
# Navigation timeout handed to Playwright (milliseconds per the
# scrapy-playwright documentation).
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 100000
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}

# Enable the Zyte Smart Proxy Middleware
DOWNLOADER_MIDDLEWARES = {
    "scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware": 610,
}

# Enable the Zyte Smart Proxy Manager
ZYTE_SMARTPROXY_ENABLED = True

# Zyte Smart Proxy API key. Credentials do not belong in version control:
# export ZYTE_SMARTPROXY_APIKEY in the environment instead. The literal
# fallback only preserves the previous (redacted) value so existing setups
# keep loading this module unchanged.
ZYTE_SMARTPROXY_APIKEY = os.environ.get(
    "ZYTE_SMARTPROXY_APIKEY",
    "f096****bc413f9bc2c71f6*******",
)

# Preserve the delay when using proxies
ZYTE_SMARTPROXY_PRESERVE_DELAY = True

# Throughput tuning: raise concurrency, keep AutoThrottle off, and allow
# slow browser-rendered downloads (DOWNLOAD_TIMEOUT is in seconds).
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 600
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urlencode

import scrapy
from scrapy_playwright.page import PageMethod
class QuotesSpider(scrapy.Spider):
    """Scrape one-way flight search results from united.com via Playwright.

    The search is configured through optional spider attributes (for
    example ``scrapy crawl united -a depart_from=JFK``):

    * ``depart_date`` -- departure date, default ``'2023-08-25'``.
    * ``depart_from`` -- origin airport code, default ``'JFK'``.
    * ``arrival_to`` -- destination airport code, default ``'DFW'``.

    One item (a plain ``dict``) is yielded per flight row found in the
    rendered results page.
    """

    name = 'united'

    def start_requests(self):
        """Yield the single search-results request, rendered by Playwright."""
        # NOTE(review): the default date is a fixed day; callers will normally
        # want to pass ``depart_date`` explicitly.
        depart_date = getattr(self, 'depart_date', '2023-08-25')
        depart_from = getattr(self, 'depart_from', 'JFK')
        arrival_to = getattr(self, 'arrival_to', 'DFW')
        self.logger.info('depart_date %s', depart_date)
        self.logger.info('from %s', depart_from)
        self.logger.info('to %s', arrival_to)

        # urlencode() escapes the user-supplied spider arguments; for plain
        # airport codes and ISO dates the query string is unchanged.
        query = urlencode({
            'f': depart_from,
            't': arrival_to,
            'd': depart_date,
            'tt': 1,
            'at': 1,
            'sc': 7,
            'px': 1,
            'taxng': 1,
            'newHP': 'True',
            'clm': 7,
            'st': 'bestmatches',
            'tqp': 'A',
        })
        yield scrapy.Request(
            url=f"https://www.united.com/en/us/fsr/choose-flights?{query}",
            headers={'Referer': 'https://www.united.com'},
            meta={
                "playwright": True,
                # Hand the live page to the callbacks; they must close it.
                "playwright_include_page": True,
                "playwright_page_methods": [],
            },
            errback=self.errback_close_page,
        )

    async def parse(self, response):
        """Yield one item per flight row in the rendered results page.

        The Playwright page received through ``response.meta`` is always
        closed before items are produced, even when interacting with it
        raises. Fields are read from ``response``; a selector that matches
        nothing yields ``None`` for that field.
        """
        page = response.meta["playwright_page"]
        try:
            await page.wait_for_timeout(1000)
            # Close login sidebar modal.
            await page.click('button#closeBtn')
            # TODO: optionally click the "show all flights" footer button
            # ('.app-components-Shopping-ResultFooter-styles__buttonContainer--T6Hxj')
            # and re-read the page content to capture every result.
        finally:
            # With playwright_include_page the page stays open until closed
            # explicitly; leaving it open leaks a browser page per request.
            await page.close()

        flights = response.css(".app-components-Shopping-GridItem-styles__flightRow--1E4Sk")
        for index, flight in enumerate(flights):
            departure_time = flight.css(".app-components-Shopping-FlightInfoBlock-styles__departTime--oRDUv ::text").get()
            arrival_time = flight.css(".app-components-Shopping-FlightInfoBlock-styles__arrivalTime--1V4Lg ::text").get()
            duration = flight.css(".app-components-Shopping-FlightInfoBlock-styles__dividerLine--2s5M8 ::text").get()
            origin = flight.css(".app-components-Shopping-FlightInfoBlock-styles__departAirport--1V3Dd ::text").get()
            destination = flight.css(".app-components-Shopping-FlightInfoBlock-styles__arrivalAirport--2976a ::text").get()
            flight_stop = flight.css('.app-components-Shopping-FlightBaseCard-styles__flightHeaderRight--25F4- ::text').get()

            # price / points
            price_economy = flight.css('[aria-describedby="MIN-ECONOMY-SURP-OR-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()
            price_premium = flight.css('[aria-describedby="ECO-PREMIUM-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()
            price_business = flight.css('[aria-describedby="MIN-BUSINESS-SURP-OR-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()

            yield {
                "index": index,
                "airline": "united",
                "depart_time": departure_time,
                "arrival_time": arrival_time,
                "flight_duration": duration,
                "depart_from": origin,
                # The fields below were extracted but previously never emitted.
                "arrival_to": destination,
                "flight_stop": flight_stop,
                "price_economy": price_economy,
                "price_premium": price_premium,
                "price_business": price_business,
            }

    async def errback_close_page(self, failure):
        """Log a failed request and close its Playwright page, if any."""
        self.logger.error('Request failed: %r', failure)
        # The page is absent when the failure happened before one was
        # attached to the request, so do not assume the key exists.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment