"""Forest covertype dataset. A classic dataset for classification benchmarks, featuring categorical and real-valued features. The dataset page is available from UCI Machine Learning Repository https://archive.ics.uci.edu/ml/datasets/Covertype Courtesy of Jock A. Blackard and Colorado State University. """ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause import logging import os from gzip import GzipFile from numbers import Integral, Real from os.path import exists, join from tempfile import TemporaryDirectory import joblib import numpy as np from ..utils import Bunch, check_random_state from ..utils._param_validation import Interval, validate_params from . import get_data_home from ._base import ( RemoteFileMetadata, _convert_data_dataframe, _fetch_remote, _pkl_filepath, load_descr, ) # The original data can be found in: # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz ARCHIVE = RemoteFileMetadata( filename="covtype.data.gz", url="https://ndownloader.figshare.com/files/5976039", checksum="614360d0257557dd1792834a85a1cdebfadc3c4f30b011d56afee7ffb5b15771", ) logger = logging.getLogger(__name__) # Column names reference: # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info FEATURE_NAMES = [ "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm", "Horizontal_Distance_To_Fire_Points", ] FEATURE_NAMES += [f"Wilderness_Area_{i}" for i in range(4)] FEATURE_NAMES += [f"Soil_Type_{i}" for i in range(40)] TARGET_NAMES = ["Cover_Type"] @validate_params( { "data_home": [str, os.PathLike, None], "download_if_missing": ["boolean"], "random_state": ["random_state"], "shuffle": ["boolean"], "return_X_y": ["boolean"], "as_frame": ["boolean"], "n_retries": [Interval(Integral, 1, None, closed="left")], "delay": [Interval(Real, 0.0, None, closed="neither")], }, prefer_skip_nested_validation=True, ) def fetch_covtype( *, data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False, as_frame=False, n_retries=3, delay=1.0, ): """Load the covertype dataset (classification). Download it if necessary. ================= ============ Classes 7 Samples total 581012 Dimensionality 54 Features int ================= ============ Read more in the :ref:`User Guide `. Parameters ---------- data_home : str or path-like, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : bool, default=True If False, raise an OSError if the data is not locally available instead of trying to download the data from the source site. random_state : int, RandomState instance or None, default=None Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. shuffle : bool, default=False Whether to shuffle dataset. return_X_y : bool, default=False If True, returns ``(data.data, data.target)`` instead of a Bunch object. .. versionadded:: 0.20 as_frame : bool, default=False If True, the data is a pandas DataFrame including columns with appropriate dtypes (numeric). The target is a pandas DataFrame or Series depending on the number of target columns. If `return_X_y` is True, then (`data`, `target`) will be pandas DataFrames or Series as described below. .. 
    n_retries : int, default=3
        Number of retries when HTTP errors are encountered.

        .. versionadded:: 1.5

    delay : float, default=1.0
        Number of seconds between retries.

        .. versionadded:: 1.5

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (581012, 54)
            Each row corresponds to the 54 features in the dataset.
        target : ndarray of shape (581012,)
            Each value corresponds to one of the 7 forest covertypes with
            values ranging between 1 and 7.
        frame : dataframe of shape (581012, 55)
            Only present when `as_frame=True`. Contains `data` and `target`.
        DESCR : str
            Description of the forest covertype dataset.
        feature_names : list
            The names of the dataset columns.
        target_names : list
            The names of the target columns.

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarrays. The first contains a 2D array of shape
        (n_samples, n_features) with each row representing one sample and
        each column representing the features. The second ndarray of shape
        (n_samples,) contains the target samples.

        .. versionadded:: 0.20

    Examples
    --------
    >>> from sklearn.datasets import fetch_covtype
    >>> cov_type = fetch_covtype()
    >>> cov_type.data.shape
    (581012, 54)
    >>> cov_type.target.shape
    (581012,)
    >>> # Let's check the 4 first feature names
    >>> cov_type.feature_names[:4]
    ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology']
    """
    data_home = get_data_home(data_home=data_home)
    covtype_dir = join(data_home, "covertype")
    samples_path = _pkl_filepath(covtype_dir, "samples")
    targets_path = _pkl_filepath(covtype_dir, "targets")
    available = exists(samples_path) and exists(targets_path)

    if download_if_missing and not available:
        os.makedirs(covtype_dir, exist_ok=True)

        # Creating temp_dir as a direct subdirectory of the target directory
        # guarantees that both reside on the same filesystem, so that we can use
        # os.rename to atomically move the data files to their target location.
        with TemporaryDirectory(dir=covtype_dir) as temp_dir:
            logger.info(f"Downloading {ARCHIVE.url}")
            archive_path = _fetch_remote(
                ARCHIVE, dirname=temp_dir, n_retries=n_retries, delay=delay
            )
            Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",")

            X = Xy[:, :-1]
            y = Xy[:, -1].astype(np.int32, copy=False)

            samples_tmp_path = _pkl_filepath(temp_dir, "samples")
            joblib.dump(X, samples_tmp_path, compress=9)
            os.rename(samples_tmp_path, samples_path)

            targets_tmp_path = _pkl_filepath(temp_dir, "targets")
            joblib.dump(y, targets_tmp_path, compress=9)
            os.rename(targets_tmp_path, targets_path)

    elif not available and not download_if_missing:
        raise OSError("Data not found and `download_if_missing` is False")

    # X and y are only bound if the download branch above just ran; otherwise
    # load the cached arrays from disk.
    try:
        X, y
    except NameError:
        X = joblib.load(samples_path)
        y = joblib.load(targets_path)

    if shuffle:
        ind = np.arange(X.shape[0])
        rng = check_random_state(random_state)
        rng.shuffle(ind)
        X = X[ind]
        y = y[ind]

    fdescr = load_descr("covtype.rst")

    frame = None
    if as_frame:
        frame, X, y = _convert_data_dataframe(
            caller_name="fetch_covtype",
            data=X,
            target=y,
            feature_names=FEATURE_NAMES,
            target_names=TARGET_NAMES,
        )
    if return_X_y:
        return X, y

    return Bunch(
        data=X,
        target=y,
        frame=frame,
        target_names=TARGET_NAMES,
        feature_names=FEATURE_NAMES,
        DESCR=fdescr,
    )
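

# A minimal usage sketch, not part of the library module above. It assumes
# scikit-learn is installed and that the compressed archive can be downloaded
# (or is already cached) in the scikit-learn data home, and it illustrates the
# `as_frame` and `return_X_y` paths described in the docstring.
if __name__ == "__main__":
    from sklearn.datasets import fetch_covtype

    # Bunch whose `frame` attribute holds the 54 feature columns plus the target.
    covtype = fetch_covtype(as_frame=True)
    print(covtype.frame.shape)  # (581012, 55)

    # Plain (X, y) arrays, shuffled reproducibly.
    X, y = fetch_covtype(return_X_y=True, shuffle=True, random_state=0)
    print(X.shape, y.shape)  # (581012, 54) (581012,)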