| | |
"""
NOTE 1: Start command for deploying a FastAPI service on Render:
@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
uvicorn app:app --host 0.0.0.0 --port 10000
"""
| |
|
| | import os , sys |
| | import datetime , requests , random , logging , time , timeit |
| | import simplejson as json |
| | from fastapi import FastAPI |
| | from fastapi.responses import PlainTextResponse , HTMLResponse , Response , JSONResponse |
| | |
| | from starlette.requests import Request |
| |
|
| | from bs4 import BeautifulSoup |
| | from furl import furl |
| | |
| | |
| | from pymongo import MongoClient |
| | import fire |
| | import socket |
| | import requests |
| |
|
| | from apscheduler.schedulers.background import BackgroundScheduler |
| |
|
# Hostname of the machine running this process; used further down to decide
# whether outbound requests go through a proxy (see the OCTOCORE check).
HOSTNAME = socket.gethostname()

# Pool of desktop-browser User-Agent strings; one is picked at random for each
# scrape request to vary the client fingerprint.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
]

# Known search-engine crawler User-Agent strings.
# NOTE(review): not referenced anywhere in this file's visible code --
# possibly used by code outside this view, or dead.
BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
]

# MongoDB Atlas connection string; None when the env var is unset (checked at
# the start of each scrape task, which then answers HTTP 500).
MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI')
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
# Root logging config: INFO level, message-only format; silence the noisy
# "requests" logger below ERROR.
logging.basicConfig(level=logging.INFO , format='%(message)s')
logging.getLogger("requests").setLevel(logging.ERROR)

# Module logger at DEBUG with its own console handler; propagate=False keeps
# records from also reaching the root handler (avoids duplicated lines).
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate=False

console_logger = logging.StreamHandler()
console_logger.setLevel(logging.DEBUG)
console_logger.setFormatter(logging.Formatter('%(message)s'))

logger.addHandler(console_logger)
| |
|
| | |
| | |
| |
|
app = FastAPI()

# Default port; the uvicorn CLI normally overrides this (see module docstring).
port = 5000
# Placeholder for an APScheduler BackgroundScheduler -- never started in the
# code visible here.
scheduler = None
# requests-style proxy mapping; empty dict means direct connection.
proxies = {}

if HOSTNAME == 'OCTOCORE':
    # Developer workstation: route outbound traffic through a VPN proxy.
    # SECURITY(review): proxy credentials are hard-coded below -- they should
    # be moved to environment variables or a secrets store and rotated.
    proxies = {'http': 'https://anonyland:c3c09a797abbc2458231d36c49c9b989@proxy-uk2.vpnsecure.me:8080', 'https': 'http://anonyland:c3c09a797abbc2458231d36c49c9b989@proxy-uk2.vpnsecure.me:8080'}
    # NOTE(review): proxy_ip is assigned but not used in the visible code.
    proxy_ip = '192.168.1.43:80'
| |
|
@app.get('/')
def index():
    """Root endpoint / health check: log which host answered and reply 'OK'."""
    logger.info(f'hostname: {HOSTNAME}')
    body = 'OK'
    return PlainTextResponse(body, 200)
| |
|
@app.get('/ping')
def ping():
    """Liveness probe: return an empty HTTP 200 response.

    Fix: this handler was previously also named ``index``, redefining the
    root handler above at module level (flake8 F811). FastAPI routes by path,
    so renaming the function leaves the /ping endpoint unchanged.
    """
    return Response(status_code=200)
| |
|
| | @app.get("/remote_ip") |
| | def remote_ip(request:Request): |
| | client_host = request.client.host |
| | return PlainTextResponse(client_host , 200) |
| |
|
| | @app.get("/task/faa_scrap_sold_listings_featured") |
| | def faa_scrap_sold_listings_featured_local(): |
| |
|
| | global proxies |
| |
|
| | timeit_request = 0 |
| | timeit_parsing = 0 |
| | timeit_mongo = 0 |
| |
|
| | response_body = '?' |
| |
|
| | if not MONGOATLAS_URI: |
| | return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined" , status_code=500) |
| |
|
| | cnt_dbs = 4 |
| |
|
| | headers = { |
| | 'User-Agent': random.choice(USER_AGENTS) |
| | } |
| |
|
| | site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(random.randint(1000,1000000)) |
| | r=None |
| |
|
| | try: |
| | start = time.time() |
| | r = requests.get(site_url , proxies=proxies , timeout=30 , verify=False , headers=headers) |
| | timeit_request = time.time()-start |
| | except Exception as e: |
| | response_body = str(e) |
| |
|
| | if r and r.status_code==200: |
| |
|
| | try: |
| |
|
| | start = time.time() |
| | listings = parse_faa_sold_listings_page(r.text) |
| | timeit_parsing = time.time() - start |
| |
|
| | d = dict() |
| | d['date_utc'] = datetime.datetime.utcnow() |
| | d['results'] = listings |
| | d['processed']= False |
| |
|
| | status = "ok" |
| |
|
| | db_name = 'faa_scrap_' + str(random.randint(1,cnt_dbs)) |
| | col_name = 'faa_sl' |
| |
|
| | mongo_client = None |
| | try: |
| | start = time.time() |
| | mongo_client = MongoClient(MONGOATLAS_URI) |
| | db = mongo_client[db_name] |
| | col = db[col_name] |
| | r = col.insert_one(d) |
| | timeit_mongo = time.time() - start |
| | except Exception as e: |
| | status = "error saving to mongodb ({})".format(str(e)) |
| | logging.error(status) |
| | finally: |
| | try: |
| | mongo_client.close() |
| | except Exception: |
| | pass |
| |
|
| |
|
| | o = dict() |
| | o['site']="faa" |
| | o['status']=status |
| | o['date'] = d['date_utc'] |
| | o['results_count'] = len(listings) |
| | o['db_name'] = db_name |
| | o['timeit'] = {'request':timeit_request, |
| | 'parsing':timeit_parsing, |
| | 'db':timeit_mongo} |
| | |
| |
|
| | response_body = str(o) |
| |
|
| | except Exception as e: |
| | response_body = str(e) |
| |
|
| | return PlainTextResponse(response_body, 200) |
| |
|
| |
|
def parse_faa_sold_listings_page(html):
    """Parse FAA's 'recent print sales' HTML into a list of listing dicts.

    Each dict has keys: ``item_page``, ``image``, ``artist_page``, ``artist``,
    ``sell_info``. Relative URLs are made absolute against
    https://fineartamerica.com/ and path-normalized with furl.

    May raise (e.g. TypeError/AttributeError) if the page structure changes;
    the caller wraps this call in a broad try/except.
    """
    soup = BeautifulSoup(html , 'lxml')

    # One productImageDiv per sold listing.
    listings_els = soup.find_all('div' , {'class':'productImageDiv'})

    listings = []

    # Fix: dropped the unused enumerate() index and renamed the ambiguous
    # single-letter local ``l`` (flake8 E741) to ``listing``.
    for listing_el in listings_els:

        listing = dict()

        item_url = listing_el.find('a')['href']
        if not item_url.startswith('http'):
            item_url = 'https://fineartamerica.com/' + item_url

        # furl normalizes the path (collapses duplicate slashes etc.).
        item_page = furl(item_url)
        item_page.path.normalize()
        listing['item_page'] = item_page.url

        listing['image'] = listing_el.find('img' , {'class':'productImage'})['src']

        artist_url = listing_el.find('p',{'class':'artistName'}).a['href']
        if not artist_url.startswith('http'):
            artist_url = 'https://fineartamerica.com/' + artist_url
        artist_page = furl(artist_url)
        artist_page.path.normalize()
        listing['artist_page'] = artist_page.url

        listing['artist'] = listing_el.find('p',{'class':'artistName'}).text
        listing['sell_info'] = listing_el.find('p' , {'class':'orderLocation'}).text

        listings.append(listing)

    # Release the parse tree promptly -- pages can be large.
    del soup

    return listings
| |
|
| | if __name__ == "__main__": |
| | import uvicorn |
| | uvicorn.run(app, host="0.0.0.0", port=7860) |