diff --git a/surfer.py b/surfer.py index aff7729..060f534 100644 --- a/surfer.py +++ b/surfer.py @@ -3,19 +3,23 @@ import requests, re, copy from bs4 import BeautifulSoup from requests.auth import HTTPBasicAuth from time import sleep +from pprint import pprint +import urllib3 class surfer: - def __init__(self, base_url='', cookies=True, allow_redirects=False, credentials=None, debug=False): + def __init__(self, base_url='', with_session=True, allow_redirects=False, credentials=None, debug=False): self.base_url = base_url self.debug = debug - self.cookies = cookies + self.with_session = with_session + if with_session: self.session = requests.Session() self.allow_redirects = allow_redirects self.credentials = credentials - + # By Nike, some fucked up servers have weak SSL, and you need to cope with it despite the error throuwn by default by the SSL lib... + requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL:@SECLEVEL=1' + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) def surf(self, waves, entry_wave): - self.cookiejar = None self.extracted_values = {} self.waves = waves self.next_wave = entry_wave @@ -28,138 +32,196 @@ class surfer: def _surf_wave(self): self.cur_wave = self.next_wave if self.debug: print('BASE=%s | WAVEURL=%s' %(self.base_url, self.waves[self.cur_wave]['sub_url'])) - if self.waves[self.cur_wave]['sub_url'].lower().startswith('http'): + if self.waves[self.cur_wave]['sub_url'].lower().startswith('https://') or self.waves[self.cur_wave]['sub_url'].lower().startswith('http://'): url = self.waves[self.cur_wave]['sub_url'] else: - url = self.base_url+self.waves[self.cur_wave]['sub_url'] + if self.waves[self.cur_wave]['sub_url'].startswith('/'): + url = self.base_url+self.waves[self.cur_wave]['sub_url'] + else: + url = self.base_url+'/'+self.waves[self.cur_wave]['sub_url'] self._prepare_params() self._prepare_headers() + req_meth_name = self.waves[self.cur_wave]['method'].lower() - req_meth = requests.__dict__[req_meth_name] - - if self.debug: print('Surfing to %s\n Method:%s\n Headers:%s\n Params:%s\n Cookies:%s\n' %(url, req_meth_name, self.headers, self.params, self.cookiejar)) - if self.credentials: - r = req_meth( url, - allow_redirects = self.allow_redirects, - headers = self.headers, - cookies=self.cookiejar, - params = self.params, - auth=(self.credentials['user'], self.credentials['pass']) - ) + if req_meth_name not in ('get','put', 'post', 'delete', 'head', 'options', ): return + if self.with_session : + req_meth = getattr(self.session, req_meth_name) else: - r = req_meth( url, - allow_redirects = self.allow_redirects, - headers = self.headers, - cookies=self.cookiejar, - params = self.params - ) + req_meth = getattr(requests, req_meth_name) - if r.cookies: - self.cookiejar = r.cookies - if self.debug: print('Cookies after wave %s : %s' %(self.cur_wave, self.cookiejar)) + if self.debug: print('Surfing to %s\n Method:%s\n Headers:%s\n Params:%s\n Cookies:%s\n' %(url, req_meth_name, self.headers, self.params, self.session.cookies.items())) + if self.credentials: + try: + if req_meth_name.lower() == 'post': + r = req_meth( url, + allow_redirects = self.allow_redirects, + headers = self.headers, + data = self.params, + auth=(self.credentials['user'], self.credentials['pass']), + timeout=(10, 20), + verify=False, + ) + else: + r = req_meth( url, + allow_redirects = self.allow_redirects, + headers = self.headers, + params = self.params, + auth=(self.credentials['user'], self.credentials['pass']), + timeout=(10, 20), + verify=False, + ) + except Exception as e: + if ('if_not_accessible' in self.waves[self.cur_wave]) and callable(self.waves[self.cur_wave]['if_not_accessible']): + self.waves[self.cur_wave]['if_not_accessible'](e.message, self) + self.next_wave = self.waves[self.cur_wave]['next_wave'] + return + else: + raise e + else: + try: + if req_meth_name.lower() == 'post': + r = req_meth( url, + allow_redirects = self.allow_redirects, + headers = self.headers, + data = self.params, + timeout=(10, 20), + verify=False, + ) + else: + r = req_meth( url, + allow_redirects = self.allow_redirects, + headers = self.headers, + params = self.params, + timeout=(10, 20), + verify=False, + ) + + except Exception as e: + if ('if_not_accessible' in self.waves[self.cur_wave]) and callable(self.waves[self.cur_wave]['if_not_accessible']): + self.waves[self.cur_wave]['if_not_accessible'](e.message, self) + self.next_wave = self.waves[self.cur_wave]['next_wave'] + return + else: + raise e + + if self.debug: print('Cookies after wave %s : %s' %(self.cur_wave, self.session.cookies.items())) self.next_wave = self.waves[self.cur_wave]['next_wave'] if r.status_code != 200: if self.debug: print('Error; http status=%s' %r.status_code) - if callable(self.waves[self.cur_wave]['if_not_200']): + if callable(self.waves[self.cur_wave]['if_not_200']): self.waves[self.cur_wave]['if_not_200'](r, self) else: if 'text' == self.waves[self.cur_wave]['parsing']: - if self.debug: print('Parsing text...') + if self.debug: print('Parsing TEXT...') data = r.text self.extracted_values = {} self._regexp_extract(data) elif 'json' == self.waves[self.cur_wave]['parsing']: - if self.debug: print('Parsing json...') + if self.debug: print('Parsing JSON...') data = r.json() - # ?? extracting in json has no sense to me !? + self.extracted_values = {'json_data' : data} elif 'html' == self.waves[self.cur_wave]['parsing']: + if self.debug: print('Parsing HTML...'); self.extracted_values = {} self._regexp_extract(r.text) data = BeautifulSoup(r.text, features="html.parser") self._html_xtract(data) elif 'xml' == self.waves[self.cur_wave]['parsing']: + if self.debug: print('Parsing XML...') self.extracted_values = {} self._regexp_extract(r.text) data = BeautifulSoup(r.text, 'xml') #features="xml.parser" self._xml_xtract(data) + else: + if self.debug: print('WARNING: No parsing !!') + data = r.content - - if callable(self.waves[self.cur_wave]['test_response']): + if callable(self.waves[self.cur_wave]['test_response']): test_ok = self.waves[self.cur_wave]['test_response'](data, self) + if not test_ok and self.debug: + with open('surfer_debug.log', 'w') as fil: fil.write(r.text.encode('utf-8', 'ignore')) else: test_ok = True if self.debug: print('Tested content:%s' %(('ok' if test_ok else 'not ok!'))) - if test_ok and callable(self.waves[self.cur_wave]['if_test_ok']): + if test_ok and callable(self.waves[self.cur_wave]['if_test_ok']): self.waves[self.cur_wave]['if_test_ok'](data, self) - elif (not test_ok) and callable(self.waves[self.cur_wave]['if_test_fail']): + elif (not test_ok) and callable(self.waves[self.cur_wave]['if_test_fail']): self.waves[self.cur_wave]['if_test_fail'](data, self) def _html_xtract(self, parsed_html): if 'extract' not in self.waves[self.cur_wave]: return - for needle in self.waves[self.cur_wave]['extract']: + for needle in self.waves[self.cur_wave]['extract']: if 'bs4_selectors' not in needle: continue node = parsed_html for selector in needle['bs4_selectors']: - res = node.select(selector) - if res: + res = node.select(selector) + if res: #if self.debug: print('Selector %s match ! %s' %(selector, res)) node = res[0] if self.debug: print('Selector %s found !' %(selector)) - else: + else: if self.debug: print('Selector %s NOT match !' %selector) break - - if res: + + if res: if ('bs4_getall' in needle) and needle['bs4_getall']: nodes = res extracts = [] for node in nodes: if 'bs4_extractor' in needle and callable(needle['bs4_extractor']): + if self.debug: print('[Getall] Adding node %s using external extractor...' %node) extracts.append(needle['bs4_extractor'](node)) else: - extracts.append(node.string) + if self.debug: print('[Getall] Adding node %s as string...' %node) + extracts.append(node.string) + elif 'bs4_getone' in needle: node = res[needle['bs4_getone']] if 'bs4_extractor' in needle and callable(needle['bs4_extractor']): + if self.debug: print('[GetOne] Adding node %s using external extractor...' %node) extracts = needle['bs4_extractor'](node) else: + if self.debug: print('[Getall] Adding node %s as string...' %node) extracts = node.string + else: node = res[0] if 'bs4_extractor' in needle and callable(needle['bs4_extractor']): + if self.debug: print('[default] Adding node %s using external extractor...' %node) extracts = needle['bs4_extractor'](node) else: + if self.debug: + if node.string : print('[default] Adding node %s as string...%s' %(node, node.string.encode('utf-8', 'ignore'))) + else:print('[default] Adding node %s [EMPTY]' %(node)) extracts = node.string - - if self.debug: print('HTML Needle Found : %s' %extracts) self.extracted_values[needle['name']] = extracts else: - if self.debug: print('HTML Needle %s NOT Found!' %needle['name']) + if self.debug: print('HTML Needle %s NOT Found!' %needle['name']) if needle['name'] not in self.extracted_values: self.extracted_values[needle['name']] = None if self.debug: print('HTML Extracted: %s' %self.extracted_values) def _xml_xtract(self, parsed_xml): if 'extract' not in self.waves[self.cur_wave]: return - for needle in self.waves[self.cur_wave]['extract']: + for needle in self.waves[self.cur_wave]['extract']: if 'bs4_selectors' not in needle: continue node = parsed_xml for selector in needle['bs4_selectors']: - res = node.select(selector) - if res: + res = node.select(selector) + if res: #if self.debug: print('Selector %s match ! %s' %(selector, res)) node = res[0] if self.debug: print('Selector %s found !' %(selector)) - else: + else: if self.debug: print('Selector %s NOT match !' %selector) break - - if res: + + if res: if ('bs4_getall' in needle) and needle['bs4_getall']: nodes = res extracts = [] @@ -167,7 +229,7 @@ class surfer: if 'bs4_extractor' in needle and callable(needle['bs4_extractor']): extracts.append(needle['bs4_extractor'](node)) else: - extracts.append(node.string) + extracts.append(node.string) elif 'bs4_getone' in needle: node = res[needle['bs4_getone']] if 'bs4_extractor' in needle and callable(needle['bs4_extractor']): @@ -182,10 +244,10 @@ class surfer: extracts = node.string - if self.debug: print('XML Needle Found : %s' %extracts) + if self.debug: print('XML Needle Found : %s' %extracts) self.extracted_values[needle['name']] = extracts else: - if self.debug: print('XML Needle %s NOT Found!' %needle['name']) + if self.debug: print('XML Needle %s NOT Found!' %needle['name']) if needle['name'] not in self.extracted_values: self.extracted_values[needle['name']] = None if self.debug: print('XML Extracted: %s' %self.extracted_values) @@ -211,19 +273,32 @@ class surfer: def _prepare_params(self): self.params = copy.deepcopy(self.waves[self.cur_wave]['params']) - for k, v in self.extracted_values.items(): - for par,parval in self.params.items(): + if isinstance(self.params,dict) : + for k, v in self.extracted_values.items(): + for par,parval in self.params.items(): + if isinstance(v, list) or isinstance(v, tuple): v = ' '.join(v) + self.params[par]=parval.replace('%%{%s}'%k.encode('utf-8', 'ignore'), v.encode('utf-8', 'ignore')) + elif isinstance(self.params,str) : + for k, v in self.extracted_values.items(): if isinstance(v, list) or isinstance(v, tuple): v = ' '.join(v) - self.params[par]=parval.replace('%%{%s}'%k.encode('utf-8', 'ignore'), v.encode('utf-8', 'ignore')) - # If replacement in keys is necessary do it here + self.params=self.params.replace('%%{%s}'%k.encode('utf-8', 'ignore'), v.encode('utf-8', 'ignore')) + + # If replacement in keys is necessary do it here def _prepare_headers(self): self.headers = copy.deepcopy(self.waves[self.cur_wave]['headers']) for k, v in self.extracted_values.items(): - if v == None: v='' - for par,parval in self.headers.items(): - if isinstance(v, list) or isinstance(v, tuple): v = ' '.join(v) - self.headers[par]=parval.replace('%%{%s}'%k.encode('utf-8', 'ignore'), v.encode('utf-8', 'ignore')) - # If replacement in keys is necessary do it here + if v == None: + v='' + elif isinstance(v, list) or isinstance(v, tuple): + v = ' '.join([x if x else '' for x in v]) + elif isinstance(v, bool): + v = 'True' if v else 'False' + + for par,parval in self.headers.items(): + if isinstance(v, str) or isinstance(v, unicode): + self.headers[par]=parval.replace('%%{%s}'%k.encode('utf-8', 'ignore'), v) + + # If replacement in keys is necessary do it here