#!/usr/bin/env python3
'''
This is a Python 3 script meant to paginate through our API and save each page
(batch of 20 results) into JSON files.

If the search has >5000 results, it will use the crawlDate of the last item
returned in the first 5000 results as the 'to' date for the next 5000, and so
on (this only works if 'sort=d', so don't change that aspect of the script).
Note that if you are getting >5000 results in a single data leak, this will
only work for the first 5000, since all docs within a specific leak have the
same crawl date.

It downloads each page of 20 results as a .json file, and by default it uses
detail=snippet, so it only downloads document metadata and a snippet. Change
the detail parameter to detail=full in order to get full document results
(note the json file size will get much bigger).

To make it work, edit the publicKey and privateKey variables at the top with
your keys (or load them from a file; a commented sketch is at the bottom of
this script). There is a list of tuples in the generatePayload() function that
can be modified to perform a different search. Other than the 'to', 'offset',
and 'sort' parameters, any of these can be changed to search for whatever you
want.
'''

import requests
import datetime
import json
import time
import os
import hmac
import hashlib
import base64

host = 'api.darkowl.com'
endpoint = '/api/v1/search'
publicKey = 'PUBLIC KEY HERE'
privateKey = 'PRIVATE KEY HERE'


def main():
    # starting 'to' date, set to a future time so all current results are included
    payload = generatePayload('3000-01-01T00:00:00Z')

    # get total results and the crawl date of the first result
    totalResults, firstCrawlDate = getInitialNumbers(payload)

    # ask user if they want to continue
    if not askUser(totalResults):
        exit()

    # loop through the rest of the results
    paginate(totalResults, payload, 2, firstCrawlDate)


### Generates the payload
### List of tuples. 1st value in the tuple is the API parameter, second is the value
def generatePayload(toDate):
    # Example search; edit these tuples to perform a different search
    ### DO NOT CHANGE THE 'SORT' OR 'TO' PARAMETERS
    ### DO NOT ADD 'OFFSET' PARAMETER
    return [
        ('q', 'test'),
        ('similar', 'true'),
        ('detail', 'snippet'),
        ('to', toDate),
        ('sort', 'd'),
    ]


### Make initial query with offset=0 to see how many results there are, how many pages needed
### Save the first results
def getInitialNumbers(payload):
    # print the proposed search
    print('Your proposed search:')
    for key, value in payload:
        print(f'\t{key}: {value}')

    # generate query string
    search = payloadToString(payload, 0)
    url = f'https://{host}{endpoint}{search}'
    print('Getting query information...', flush=True)

    # make request
    r = requests.get(url, headers=gen_headers(endpoint + search))

    # check status
    if r.status_code != 200:
        print(f'Error: Status Code {r.status_code}')
        print(r.url)
        print(r.content)
        exit()

    # save 1st 20 results as a json file
    dumpResultstoFile(r, 1)

    # if <=20 results, json already saved, no need to paginate
    if r.json()['total'] <= 20:
        print('Query has 20 or fewer results. Saving to file and exiting')
        exit()

    # get/set 1st crawl date
    firstCrawlDate = ''
    for result in r.json()['results']:
        firstCrawlDate = result['crawlDate']
        break

    return r.json()['total'], firstCrawlDate


### Paginate through all results
### Once offset 5000 is reached, the same query is made but with the 'to' date
### being that of the last result returned by the current query
def paginate(totalResults, payload, masterPage, firstCrawlDate):
    offset = 0
    lastCrawlDate = ''

    # page 1 (offset 0) was already saved by getInitialNumbers, so the first
    # batch starts at offset 20; later batches start over at offset 0
    if masterPage == 2:
        offset = 20

    while offset < 5000:
        # generate query string
        search = payloadToString(payload, offset)
        url = f'https://{host}{endpoint}{search}'

        # print status
        print(f'Getting page {masterPage}... ', end='', flush=True)
        time.sleep(1)

        r = requests.get(url, headers=gen_headers(endpoint + search))
        if r.status_code != 200:
            print(f'Error: Status Code {r.status_code}')
            print(r.url)
            print(r.content)
            exit()
        print('success')

        responseJSON = r.json()
        dumpResultstoFile(r, masterPage)

        # if fewer than 20 results came back, this is the last page
        if responseJSON['resultCount'] < 20:
            print('Done! Exiting.')
            exit()

        offset += 20
        masterPage += 1

        if offset == 5000:
            # since this is the last page for this batch of 5000, get the last crawlDate to set
            # as our next batch's 'to' date
            # the crawlDate field is inclusive, so there may be 1 duplicate result per batch of 5000
            for result in responseJSON['results']:
                lastCrawlDate = result['crawlDate']
            if firstCrawlDate == lastCrawlDate:
                print('No change in crawl date. Exiting to avoid infinite loop.')
                exit()
            paginate(totalResults, generatePayload(lastCrawlDate), masterPage, firstCrawlDate)


### Saves the json returned by the Vision API into a .json file
def dumpResultstoFile(r, masterPage):
    if not os.path.exists('./results'):
        try:
            os.mkdir('./results')
        except OSError:
            print('Could not create directory to save files')
            exit()
    json_data = r.json()
    with open(f'./results/Page_{masterPage}.json', 'w') as outFile:
        json.dump(json_data, outFile, indent=4)


### Make sure the user is ok with the number of API calls the query will require
def askUser(totalResults):
    numPages = ((totalResults - 1) // 20) + 1
    print(f'\nThis query has {totalResults} total results, which will take about {numPages} API calls to download.\nDo you want to continue? (y or n)')
    while True:
        answer = input()
        if answer.lower() == 'y':
            return True
        elif answer.lower() == 'n':
            return False
        else:
            print('Please enter "y" or "n"')


### Generates request headers based on the URL being queried
def gen_headers(absPath):
    # get current date
    date = datetime.datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')

    # create string to hash and sign
    string2hash = f'GET{absPath}{date}'

    # convert to bytes
    bkey = bytes(source=privateKey, encoding='UTF-8')
    bpayload = bytes(source=string2hash, encoding='UTF-8')

    # sign with priv key
    hmacsha1 = hmac.new(bkey, bpayload, hashlib.sha1).digest()

    # convert to b64
    base64encoded = base64.b64encode(hmacsha1).decode('UTF-8')

    # construct final auth header
    auth = f'OWL {publicKey}:{base64encoded}'

    # return headers
    return {'Authorization': auth, 'X-VISION-DATE': date, 'Accept': 'application/json'}


### Takes a payload (list of tuples) and generates a URL query string
def payloadToString(payload, offset):
    search = ''
    count = 0
    for key, value in payload:
        if count == 0:
            search += f'?{key}={value}'
            count = 1
        else:
            search += f'&{key}={value}'
    search += f'&offset={offset}'
    return search


if __name__ == '__main__':
    main()
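
# ---------------------------------------------------------------------------
# Optional: loading the keys from a file.
# The docstring above mentions loading publicKey/privateKey from a file
# instead of hard-coding them. A minimal sketch, assuming a local "keys.json"
# containing {"publicKey": "...", "privateKey": "..."}; the file name and
# field names are illustrative choices, not part of the API:
#
#   with open('keys.json') as keyFile:
#       keys = json.load(keyFile)
#   publicKey = keys['publicKey']
#   privateKey = keys['privateKey']
#
# If used, these assignments should replace the placeholder values near the
# top of the script, before any request is made.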