Source code for tradingWithPython.lib.yahooFinance

# -*- coding: utf-8 -*-

# Author: Jev Kuznetsov <jev.kuznetsov@gmail.com>
# License: BSD


"""

.. ipython:: python
    :suppress:
        
    import pandas as pd
    np.set_printoptions(precision=2, suppress=True)
    pd.options.display.max_rows= 6


Yahoo Finance 
====================


This module enables easy access to data provided by Yahoo Finance.

.. note::
    
    This service may stop working without notice; Yahoo does not seem to like people accessing
    their data automatically. The service was already broken once in early 2017. This module
    includes a workaround that works ... for now.


Getting historic data
-----------------------

The module is usually imported as follows:

.. ipython:: python

   from tradingWithPython import yahooFinance as yf

Single symbol
---------------------
   
Then, to get raw yahoo finance data for a symbol use :func:`~lib.yahooFinance.getSymbolData`

.. ipython:: python

   df = yf.getSymbolData("SPY")
   df.head()
   
We can also normalize OHLC with the *adj_close* data column. After normalization,
the *close* column will be equal to *adj_close* , so the latter is omitted from the result.

.. ipython:: python

    df = yf.getSymbolData("SPY",adjust=True)
    df.head()
    
Multiple symbols
-------------------------

:func:`~lib.yahooFinance.getHistoricData` will accept one or more symbols and download them
while displaying a progress bar.

.. ipython:: python
    
    symbols = ['XLE','USO','SPY']
    data = yf.getHistoricData(symbols)
    
The data will be a multi-index DataFrame:

.. ipython:: python

    data.columns

To select  a symbol, simply use

.. ipython:: python
    
    data['SPY']
    

Or with cross-section (see `Advanced indexing <https://pandas.pydata.org/pandas-docs/stable/advanced.html>`_)
    
.. ipython:: python

    data.xs('close',level=1,axis=1)
    

    

   
Functions
==========

.. autofunction:: tradingWithPython.lib.yahooFinance.getSymbolData
.. autofunction:: tradingWithPython.lib.yahooFinance.getHistoricData

"""



import urllib.request
import numpy as np

import requests # interaction with the web
import os  #  file system operations
import yaml # human-friendly data format
import re  # regular expressions
import pandas as pd # pandas... the best time series library out there
import datetime as dt # date and time functions
import io

from tradingWithPython.lib.extra import ProgressBar

# strftime/strptime pattern used for the 'timestamp' entry of the cached token
# (written in getToken, parsed again in loadToken to compute the token age)
dateTimeFormat = "%Y%m%d %H:%M:%S"

def parseStr(s):
    '''
    Convert a raw text field to a float, NaN or a clean string.

    Parameters
    ------------
    s : str
        raw field text, surrounding whitespace is ignored

    Returns
    ---------
    str or float
        * field starting with a double quote -> the text with the quotes stripped
        * ``N/A`` -> ``np.nan``
        * number, optionally followed by ``M`` (1e6) or ``B`` (1e9) -> float
        * anything else (including an empty field) -> the original string ``s``

    '''
    f = s.strip()

    if not f: # empty or whitespace-only field, indexing f[0] would fail
        return s

    if f[0] == '"':
        return f.strip('"')

    if f == 'N/A':
        return np.nan

    prefixes = {'M':1e6, 'B': 1e9}
    suffix = f[-1]

    try: # try float conversion
        if suffix in prefixes: # do we have a Billion/Million character?
            return float(f[:-1])*prefixes[suffix]
        return float(f) # no, convert to float directly
    except ValueError: # failed, return original string
        return s



#def getQuote(symbols): has been disabled by Yahoo :-(
 

def getHistoricData(symbols, **options):
    '''
    get data from Yahoo finance and return pandas dataframe

    A single symbol (str) is passed straight to :func:`getSymbolData`.
    For a list of symbols each one is downloaded in turn while a progress
    bar is shown, and the results are concatenated column-wise.

    Parameters
    ------------
    symbols : str, list or tuple
        Yahoo finance symbol or a sequence of symbols
    **options
        keyword arguments forwarded to :func:`getSymbolData`, for example:

        sDate : tuple  (optional)
            start date (y,m,d)
        adjust : bool
            T/[F] adjust data based on adj_close
        verbose : bool
            print the number of rows per download. When several symbols are
            requested this defaults to False so the progress bar is not
            interrupted.

    Returns
    ---------
    DataFrame
        plain OHLCV frame for a single symbol, otherwise a frame with
        multi-index columns whose levels are named ``symbol`` and ``ohlcv``

    '''

    # NOTE: kept as an assert so that callers catching AssertionError keep working
    assert isinstance(symbols,(list,tuple,str)), 'Input must be a string symbol or a list of symbols'

    if isinstance(symbols,str):
        return getSymbolData(symbols,**options)

    # quiet by default, but an explicit verbose=... from the caller must not
    # clash with a second, hard-coded verbose keyword
    options.setdefault('verbose', False)

    data = {}
    print('Downloading data:')
    p = ProgressBar(len(symbols))
    for idx,symbol in enumerate(symbols):
        p.animate(idx+1)
        data[symbol] = getSymbolData(symbol,**options)

    return pd.concat(data,axis=1, names=['symbol','ohlcv'])

def getSymbolData(symbol, sDate=(2000,1,1), adjust=False, verbose=True, dumpDest=None, timeout=30):
    """
    get data from Yahoo finance and return pandas dataframe

    Downloads daily history from ``sDate`` until now as csv, using the
    crumb and cookie held in the module level ``_token``.

    Parameters
    -----------
    symbol : str
        Yahoo finance symbol
    sDate : tuple , default (2000,1,1)
        start date (y,m,d)
    adjust : bool , default False
        use adjusted close values to correct OHLC. adj_close will be omitted
    verbose : bool , default True
        print output
    dumpDest : str, default None
        directory in which the raw csv is dumped as ``<symbol>_dump.csv`` for debugging
    timeout : float or tuple, default 30
        timeout in seconds handed to ``requests.get``, so that a stalled
        connection raises instead of blocking forever

    Returns
    ---------
        DataFrame, indexed by the first csv column (parsed as dates), column
        names lower-cased with spaces replaced by underscores, values rounded
        to 2 decimals

    Raises
    ---------
    requests.HTTPError
        when the server answers with an error status

    """

    period1 = int(dt.datetime(*sDate).timestamp()) # convert to seconds since epoch
    period2 = int(dt.datetime.now().timestamp())

    params = (symbol, period1, period2, _token['crumb'])

    url = "https://query1.finance.yahoo.com/v7/finance/download/{0}?period1={1}&period2={2}&interval=1d&events=history&crumb={3}".format(*params)

    data = requests.get(url, cookies={'B':_token['cookie']}, timeout=timeout)
    data.raise_for_status() # raise error in case of bad request

    if dumpDest is not None:
        fName = symbol+'_dump.csv'
        # explicit encoding: do not depend on the platform default when writing text
        with open(os.path.join(dumpDest, fName),'w', encoding='utf-8') as fid:
            fid.write(data.text)

    buf = io.StringIO(data.text) # create a buffer
    df = pd.read_csv(buf,index_col=0,parse_dates=True, na_values=['null']) # convert to pandas DataFrame

    # rename columns, 'Adj Close' -> 'adj_close'
    renames = {c: c.lower().replace(' ','_') for c in df.columns}
    df = df.rename(columns=renames)

    if verbose:
        print(('Got %i days of data' % len(df)))

    if adjust:
        return _adjust(df,removeOrig=True).round(2)

    return df.round(2)

def _adjust(df, removeOrig=False):
    '''
    adjust hist data based on adj_close field

    The ratio close/adj_close is used to scale open, high and low. The input
    frame is not modified, a new DataFrame is returned.

    Parameters
    ------------
    df : DataFrame
        must contain the columns open, high, low, close and adj_close
    removeOrig : bool, default False
        if False, the columns adj_open, adj_high and adj_low are added.
        if True, the unadjusted open, close, high and low columns are dropped
        and the adjusted columns take over their names.

    Returns
    ---------
    DataFrame

    '''
    c = df['close']/df['adj_close']

    # assign() builds a new frame, so the DataFrame of the caller stays untouched
    adjusted = df.assign(adj_open=df['open']/c,
                         adj_high=df['high']/c,
                         adj_low=df['low']/c)

    if removeOrig:
        adjusted = adjusted.drop(['open','close','high','low'],axis=1)
        renames = {'adj_open':'open', 'adj_close':'close', 'adj_high':'high', 'adj_low':'low'}
        adjusted = adjusted.rename(columns=renames)

    return adjusted


def loadToken():
    """
    get cookie and crumb from disk, or from yahoo when the cache is unusable.

    The token is cached in ``~/twpData/yahoo_cookie.yml``. A new token is
    requested with :func:`getToken` (and written to that file) when the file
    is missing, cannot be parsed, or is ``refreshDays`` days old or older.

    Returns
    ---------
    dict
        token data as stored in the cache file / returned by :func:`getToken`

    """
    refreshDays = 30 # refresh cookie every x days

    # set destination file
    dataDir = os.path.expanduser('~')+'/twpData'
    dataFile = os.path.join(dataDir,'yahoo_cookie.yml')

    try: # load file from disk
        with open(dataFile,'r') as fid:
            # safe_load: the file only holds plain strings, and yaml.load
            # without an explicit Loader is rejected by recent PyYAML versions
            data = yaml.safe_load(fid)

        age = (dt.datetime.now() - dt.datetime.strptime(data['timestamp'], dateTimeFormat)).days

        # explicit check instead of assert, asserts vanish under python -O
        if age < refreshDays:
            return data

    except (FileNotFoundError, KeyError, TypeError, ValueError, yaml.YAMLError):
        # no cache file, or its content is empty / incomplete / garbled:
        # fall through and fetch a fresh token
        pass

    os.makedirs(dataDir, exist_ok=True)

    return getToken(dataFile)


def getToken(fName = None):
    """
    get cookie and crumb from yahoo

    Downloads the AAPL history page, takes the cookie named 'B' from the
    response and searches the html for the CrumbStore entry.

    Parameters
    -----------
    fName : str, default None
        if given, the token is also written to this file as yaml

    Returns
    ---------
    dict
        with keys 'crumb', 'cookie' and 'timestamp' (formatted with dateTimeFormat)

    Raises
    ---------
    requests.HTTPError
        when the page could not be downloaded
    KeyError
        when the response carries no cookie named 'B'
    RuntimeError
        when no crumb is found in the page

    """

    url = 'https://uk.finance.yahoo.com/quote/AAPL/history' # url for a ticker symbol, with a download link
    r = requests.get(url, timeout=30)  # download page, do not wait forever on a stalled connection
    r.raise_for_status() # check for successful download before touching the content

    txt = r.text # extract html

    cookie = r.cookies['B'] # the cookie we're looking for is named 'B'

    # raw string, so the backslashes reach the regex engine unchanged
    pattern = re.compile(r'.*"CrumbStore":\{"crumb":"(?P<crumb>[^"]+)"\}')

    crumb = None
    for line in txt.splitlines():
        m = pattern.match(line)
        if m is not None:
            crumb = m.group('crumb') # the last matching line wins

    if crumb is None:
        raise RuntimeError('Could not find a crumb in the page at %s' % url)

    data = {'crumb': crumb, 'cookie':cookie, 'timestamp':dt.datetime.now().strftime(dateTimeFormat)}

    if fName is not None: # save to file
        with open(fName,'w') as fid:
            yaml.dump(data,fid)

    return data


#-------------- get token
# NOTE: this runs at import time. loadToken() reads the cached token file and
# falls back to getToken(), which performs an HTTP request and writes the file.
# getSymbolData reads the 'crumb' and 'cookie' entries of _token.
_token = loadToken() # get token from disk or yahoo

#--------------tests------------
# to be executed with pytest


def test_getToken():
    ''' download token and check that all expected fields are filled '''
    print('getting token')
    token = getToken()

    # without these checks the test only fails when getToken raises
    for key in ('crumb', 'cookie', 'timestamp'):
        assert token[key], 'empty token field: ' + key


def test_initToken():
    ''' delete the cached token file, then check that loadToken creates it again '''
    cacheFile = os.path.join(os.path.expanduser('~')+'/twpData', 'yahoo_cookie.yml')

    # start from a clean state
    if os.path.exists(cacheFile):
        os.remove(cacheFile)

    loadToken()

    fileCreated = os.path.exists(cacheFile)
    assert fileCreated

def test_download():
    ''' download SPY with default settings, more than 4000 rows are expected '''
    spy = getSymbolData('SPY')
    nRows = len(spy)

    assert nRows > 4000