Well, hmmm, several Mac users and at least one Windows user with the new error. We need to be able to see exactly what is happening between clicking the button and allegedly getting something back from Yahoo.
I've written a new getHtml(), heavily leaning on the old one, which collects all the gubbins like HTTP response code and server header and exception codes, into a data structure that can be queried and used to produce messages. It's not quite a drop-in replacement, as the function is now a method on an object, and the object collects all the gory details.
I've tried it inside a modified copy of HYPTUSS on just the Yahoo fetches, and dumped useful text into yuuuuge message boxes, so it is possible to gather clues even without the logging, though that works too. This is one way you might use it:
Code: Select all
webAgent = WebAgent()
...
html = webAgent.fetch(url)
if webAgent.fetched_ok():
print(html)
else:
print( "Diagnostics:\n" + str(webAgent) )
if webAgent.get_response_code() == 404:
#do something with this situation
...
There's a bit more documentation inside the class.
If Kiloran or anyone is interested in trying it, here is a test file with the present version embedded. Run it with 'python3 test_file.py' (or whatever you save it as). All the output appears in the logfile 'output.log'.
Code: Select all
###########################################################################
import logging
LOGFILE = 'output.log'
LOGFORMAT = '%(asctime)s %(levelname)5s [%(lineno)4s - %(funcName)-15s] %(message)s'
LOGLEVEL = logging.DEBUG
Logger = logging.getLogger()
Logger.setLevel(LOGLEVEL)
tmp = logging.FileHandler(LOGFILE)
tmp.setFormatter(logging.Formatter(LOGFORMAT))
Logger.addHandler(tmp)
del tmp
Logger.info("Start")
###########################################################################
import sys
try:
#Python3
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlencode
except:
#Python2
from urllib2 import Request, urlopen, URLError, HTTPError
from urllib import urlencode
Logger.info("Python %s", sys.version)
###########################################################################
try:
import ParameterDict
except:
pass
###########################################################################
class WebAgent(object):
"""
Class to access web pages. Wraps Python3 or Python2 implementation details.
Keeps a record of the request:
(1) requested url, (2) real url retrieved, (3) HTTP response code,
(4) returned header info, (5) number of tries, (6) maximum tries allowed,
(7) last request timeout, (8) last exception message, (9) the page itself.
Constructor and usage:
webAgent = WebAgent()
html = webAgent.fetch(url)
if webAgent.fetch_ok():
print(html)
else:
print( "Diagnostics:\n" + str(webAgent) )
if webAgent.get_response_code() == 404:
#do something with this situation
...
Public methods:
fetch(url) : fetches and returns the web page:
returns 'no response' if URL cannot be retrieved after preset retries
and timeouts, or is invalid.
fetched_ok() : returns True/False as fetch succeeded/failed
get_html() : returns already fetched web page;
get_response_code() : returns HTTP response code as integer (200, 404, etc.)
get_url() : returns original URL
get_real_url() : returns real URL retrieved
get_error() : returns exception/error condition
get_info() : rerturns server headers as a dict (Content-Type, etc.)
See: https://docs.python.org/3.4/howto/urllib2.html (Python3)
https://docs.python.org/2/howto/urllib2.html (Python2)
"""
Header = {
'User-Agent': 'Mozilla/5.0 AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive',
}
Deft_Html = 'no response'
Deft_Timeout = 10
MaxTries = 5
NoError = None
def __init__(self, paramDict=None):
self.params = paramDict
self.state = {
'url': None, #supplied url
'geturl': None, #actual url retrieved (possible redirect)
'status': None, #http response code (200, 404, etc)
'error': None, #exception raised with error message of last try
'tries': None, #number of tries
'timeout': None, #timeout of last try
'info': None, #meta info
'html': None, #the retrieved page
}
self._reset_state()
def _reset_state(self, url=None):
self.state['url'] = url
self.state['geturl'] = None
self.state['status'] = None
self.state['error'] = self.NoError
self.state['html'] = self.Deft_Html
self.state['info'] = {}
self.state['tries'] = 0
try:
self.state['timeout'] = self.params['webTimeOut']
if self.state['timeout'] < 1:
self.state['timeout'] = self.Deft_Timeout
except Exception as e:
Logger.debug(e)
self.state['timeout'] = self.Deft_Timeout
def fetch(self, url):
if not isinstance(url, str):
self.state['error'] = "URL must be a string '{}'".format(str(url))
return self.state['html']
self._reset_state(url)
self.state['tries'] = 1
while True:
Logger.debug('try {!s}/{!s}/{!s}'.format(self.state['tries'],
self.MaxTries,
self.state['timeout']))
try:
req = Request(url, headers=self.Header)
response = urlopen(req, None, self.state['timeout']) #with timeout
except HTTPError as e:
self.state['error'] = 'HTTPError: ' + str(e.code)
except URLError as e:
self.state['error'] = 'URLError: ' + str(e.reason)
except Exception as e:
self.state['error'] = 'Exception: ' + str(e)
else:
self.state['status'] = response.getcode()
self.state['geturl'] = response.geturl()
self.state['info'] = response.info()
self.state['html'] = response.read().decode('utf-8', 'ignore')
self.state['error'] = self.NoError #cleanup
break #got something
Logger.error(self.state['error'])
if self.state['tries'] >= self.MaxTries:
break #give up
self.state['timeout'] *= 2
self.state['tries'] += 1
return self.state['html']
#pretty-print the object diagnostics for print() and str() calls
def __str__(self):
triesmaxtime = '{!s}/{!s}/{!s}'.format(
self.state['tries'], self.MaxTries, self.state['timeout'])
s = (" {:<}: {!s}" * 6)[2:]
s = s.format(
'status', self.state['status'],
'tries/max/timeout', triesmaxtime,
'error', self.state['error'],
'url', self.state['url'],
'geturl', self.state['geturl'],
'info', sorted(self.state['info'].items()),
)
return s
def fetched_ok(self):
if self.state['status'] == 200: return True
if self.state['html'] != self.Deft_Html: return True
if self.state['error'] == self.NoError: return True
return False
def get_html(self): return self.state['html']
def get_response_code(self): return self.state['status']
def get_url(self): return self.state['url']
def get_real_url(self): return self.state['geturl']
def get_error(self): return self.state['error']
def get_info(self): return self.state['info']
###########################################################################
if __name__ == '__main__':
def make_url(url, values):
if url is not None and values is not None:
return url + '?' + urlencode(values)
return url
p = ParameterDict.ParameterDict(None)
o = WebAgent(p) #create once, then reuse
Logger.info(sorted(p.items()))
url = 1234
Logger.debug('')
Logger.debug('Trying: ' + str(url))
o.fetch(url)
Logger.debug(o)
Logger.debug(o.get_html())
url = 'this is garbage'
Logger.debug('')
Logger.debug('Trying: ' + url)
o.fetch(url)
Logger.debug(o)
Logger.debug(o.get_html())
url = make_url('https://valid.looking.url', [('d',4), ('e',5), ('f',6)])
Logger.debug('')
Logger.debug('Trying: ' + url)
o.fetch(url)
Logger.debug(o)
Logger.debug(o.get_html())
url = 'https://query1.finance.yahoo.com/v7/finance/quote?symbols=^FTSE,^FTAS'
Logger.debug('')
Logger.debug('Trying: ' + url)
o.fetch(url)
Logger.debug(o)
Logger.debug(o.get_html())
Logger.info('End')
###########################################################################
cheers
desmid