#!/usr/bin/env python3
'''
This is a Python 3 script meant to paginate through our API and save each page
(batch of 20 results) into JSON files.

If the search has >5000 results, it will use the crawlDate of the last item
returned in the first 5000 results as the 'to' date for the next 5000, and so
on (this only works if 'sort=d', so don't change that aspect of the script).
Note that if you are getting >5000 results in a single data leak, this will
only work for the first 5000, since all docs within a specific leak have the
same crawl date.

It downloads each page of 20 results as a .json file, and by default it uses
detail=snippet, so it only downloads document metadata and a snippet. Change
the detail parameter to detail=full in order to get full document results
(note the json file size will get much bigger).

To make it work, edit the publicKey and privateKey variables at the top with
your keys (or load them from a file; a commented sketch is at the bottom of
this script). There is a list of tuples in the generatePayload() function that
can be modified to perform a different search. Other than the 'to', 'offset',
and 'sort' parameters, any of these can be changed to search for whatever you
want.
'''

import requests
import datetime
import json
import time
import os
import hmac
import hashlib
import base64

host = 'api.darkowl.com'
endpoint = '/api/v1/search'
publicKey = 'PUBLIC KEY HERE'
privateKey = 'PRIVATE KEY HERE'


def main():
    # starting 'to' date, set to a future time so all current results are included
    payload = generatePayload('3000-01-01T00:00:00Z')

    # get total results and the crawl date of the first result
    totalResults, firstCrawlDate = getInitialNumbers(payload)

    # ask user if they want to continue
    if not askUser(totalResults):
        exit()

    # loop through the rest of the results
    paginate(totalResults, payload, 2, firstCrawlDate)


### Generates the payload
### List of tuples. 1st value in the tuple is the API parameter, second is the value
def generatePayload(toDate):
    # Example search; edit these tuples to perform a different search
    ### DO NOT CHANGE THE 'SORT' OR 'TO' PARAMETERS
    ### DO NOT ADD 'OFFSET' PARAMETER
    return [
        ('q', 'test'),
        ('similar', 'true'),
        ('detail', 'snippet'),
        ('to', toDate),
        ('sort', 'd'),
    ]


### Make initial query with offset=0 to see how many results there are, how many pages needed
### Save the first results
def getInitialNumbers(payload):
    # print the proposed search
    print('Your proposed search:')
    for key, value in payload:
        print(f'\t{key}: {value}')

    # generate query string
    search = payloadToString(payload, 0)
    url = f'https://{host}{endpoint}{search}'
    print('Getting query information...', flush=True)

    # make request
    r = requests.get(url, headers=gen_headers(endpoint + search))

    # check status
    if r.status_code != 200:
        print(f'Error: Status Code {r.status_code}')
        print(r.url)
        print(r.content)
        exit()

    # save 1st 20 results as a json file
    dumpResultstoFile(r, 1)

    # if <=20 results, json already saved, no need to paginate
    if r.json()['total'] <= 20:
        print('Query has 20 or fewer results. Saving to file and exiting')
        exit()

    # get/set 1st crawl date
    firstCrawlDate = ''
    for result in r.json()['results']:
        firstCrawlDate = result['crawlDate']
        break

    return r.json()['total'], firstCrawlDate


### Paginate through all results
### Once offset 5000 is reached, the same query is made but with the 'to' date
### being that of the last result returned by the current query
def paginate(totalResults, payload, masterPage, firstCrawlDate):
    offset = 0
    lastCrawlDate = ''

    # page 1 (offset 0) was already saved by getInitialNumbers, so the first
    # batch starts at offset 20; later batches start over at offset 0
    if masterPage == 2:
        offset = 20

    while offset < 5000:
        # generate query string
        search = payloadToString(payload, offset)
        url = f'https://{host}{endpoint}{search}'

        # print status
        print(f'Getting page {masterPage}... ', end='', flush=True)
        time.sleep(1)

        r = requests.get(url, headers=gen_headers(endpoint + search))
        if r.status_code != 200:
            print(f'Error: Status Code {r.status_code}')
            print(r.url)
            print(r.content)
            exit()
        print('success')

        responseJSON = r.json()
        dumpResultstoFile(r, masterPage)

        # if fewer than 20 results came back, this is the last page
        if responseJSON['resultCount'] < 20:
            print('Done! Exiting.')
            exit()

        offset += 20
        masterPage += 1

        if offset == 5000:
            # since this is the last page for this batch of 5000, get the last crawlDate to set
            # as our next batch's 'to' date
            # the crawlDate field is inclusive, so there may be 1 duplicate result per batch of 5000
            for result in responseJSON['results']:
                lastCrawlDate = result['crawlDate']
            if firstCrawlDate == lastCrawlDate:
                print('No change in crawl date. Exiting to avoid infinite loop.')
                exit()
            paginate(totalResults, generatePayload(lastCrawlDate), masterPage, firstCrawlDate)


### Saves the json returned by the Vision API into a .json file
def dumpResultstoFile(r, masterPage):
    if not os.path.exists('./results'):
        try:
            os.mkdir('./results')
        except OSError:
            print('Could not create directory to save files')
            exit()
    json_data = r.json()
    with open(f'./results/Page_{masterPage}.json', 'w') as outFile:
        json.dump(json_data, outFile, indent=4)


### Make sure the user is ok with the number of API calls the query will require
def askUser(totalResults):
    numPages = ((totalResults - 1) // 20) + 1
    print(f'\nThis query has {totalResults} total results, which will take about {numPages} API calls to download.\nDo you want to continue? (y or n)')
    while True:
        answer = input()
        if answer.lower() == 'y':
            return True
        elif answer.lower() == 'n':
            return False
        else:
            print('Please enter "y" or "n"')


### Generates request headers based on the URL being queried
def gen_headers(absPath):
    # get current date
    date = datetime.datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')

    # create string to hash and sign
    string2hash = f'GET{absPath}{date}'

    # convert to bytes
    bkey = bytes(source=privateKey, encoding='UTF-8')
    bpayload = bytes(source=string2hash, encoding='UTF-8')

    # sign with priv key
    hmacsha1 = hmac.new(bkey, bpayload, hashlib.sha1).digest()

    # convert to b64
    base64encoded = base64.b64encode(hmacsha1).decode('UTF-8')

    # construct final auth header
    auth = f'OWL {publicKey}:{base64encoded}'

    # return headers
    return {'Authorization': auth, 'X-VISION-DATE': date, 'Accept': 'application/json'}


### Takes a payload (list of tuples) and generates a URL query string
def payloadToString(payload, offset):
    search = ''
    count = 0
    for key, value in payload:
        if count == 0:
            search += f'?{key}={value}'
            count = 1
        else:
            search += f'&{key}={value}'
    search += f'&offset={offset}'
    return search


if __name__ == '__main__':
    main()
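
# ---------------------------------------------------------------------------
# Optional: loading the keys from a file.
# The docstring above mentions loading publicKey/privateKey from a file
# instead of hard-coding them. A minimal sketch, assuming a local "keys.json"
# containing {"publicKey": "...", "privateKey": "..."}; the file name and
# field names are illustrative choices, not part of the API:
#
#   with open('keys.json') as keyFile:
#       keys = json.load(keyFile)
#   publicKey = keys['publicKey']
#   privateKey = keys['privateKey']
#
# If used, these assignments should replace the placeholder values near the
# top of the script, before any request is made.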