Skip to content

csvdataset

CSVDataset

A Dataset for handling CSV data.

Parameters:

Name Type Description Default
filename str

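The name of the CSV file, joined onto directory to form the full path. The intended default is “data.csv”; the signature default is None, so pass the file name explicitly.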

None
directory str

The path to the directory where the CSV file is stored. Defaults to None, in which case the file is looked up in the directory that contains the csvdataset module.

None
feature_type dtype

The data type of the features. Defaults to float.

float
target_column str

The name of the target column. Defaults to “y”.

'y'
target_type dtype

The data type of the targets. Defaults to float.

float
train bool

Whether the dataset is for training or not. Defaults to True.

True
rmNA bool

Whether to remove rows with NA values or not. Defaults to True.

True
dropId bool

Whether to drop the “id” column or not. Defaults to False.

False
**desc Any

Additional keyword arguments.

{}

Examples:

>>> from spotriver.data.csvdataset import CSVDataset
>>> dataset = CSVDataset(filename='data.csv', target_column='prognosis', feature_type=int)
>>> print(dataset.data.shape)
Source code in spotriver/data/csvdataset.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
class CSVDataset:
    """
    A Dataset for handling CSV data.

    The CSV file is read once, at construction time, and the resulting
    DataFrame is kept in ``self.data``.

    Args:
        filename (str): Name of the CSV file, joined onto ``directory``.
            ``None`` (the signature default) is treated as "data.csv".
        directory (str): The path to the directory where the CSV file is stored.
            Defaults to None, in which case the file is looked up in the
            directory that contains this module.
        feature_type (dtype): The data type of the features. Defaults to float.
            Stored on the instance; not applied while loading.
        target_column (str): The name of the target column. Defaults to "y".
            Stored on the instance; not applied while loading.
        target_type (dtype): The data type of the targets. Defaults to float.
            Stored on the instance; not applied while loading.
        train (bool): Whether the dataset is for training or not. Defaults to True.
            Stored on the instance; not applied while loading.
        rmNA (bool): Whether to remove rows with NA values or not. Defaults to True.
        dropId (bool): Whether to drop the "id" column or not. Defaults to False.
        **desc (Any): Additional keyword arguments. Kept in ``self.desc``.

    Raises:
        FileNotFoundError: If the resolved CSV path does not exist.
        KeyError: If ``dropId`` is True and the file has no "id" column.

    Examples:
        >>> from spotriver.data.csvdataset import CSVDataset
        >>> dataset = CSVDataset(filename='data.csv', target_column='prognosis', feature_type=int)
        >>> print(dataset.data.shape)
    """

    def __init__(
        self,
        filename: str = None,
        directory: str = None,
        feature_type: type = float,
        target_column: str = "y",
        target_type: type = float,
        train: bool = True,
        rmNA: bool = True,
        dropId: bool = False,
        **desc,
    ) -> None:
        # A None filename cannot be joined onto a path; fall back to the
        # documented default instead of failing with a TypeError.
        self.filename = "data.csv" if filename is None else filename
        self.directory = directory
        self.feature_type = feature_type
        self.target_type = target_type
        self.target_column = target_column
        self.train = train
        self.rmNA = rmNA
        self.dropId = dropId
        # Keep the extra keyword arguments instead of silently discarding them.
        self.desc = desc
        # Loading happens last because it reads the attributes set above.
        self.data = self._load_data()

    @property
    def path(self):
        """Full path of the CSV file as a ``pathlib.Path``."""
        if self.directory:
            return pathlib.Path(self.directory).joinpath(self.filename)
        # No directory given: resolve relative to this module's directory.
        return pathlib.Path(__file__).parent.joinpath(self.filename)

    @property
    def _repr_content(self):
        """Dictionary of display fields, including the resolved file path."""
        # This class has no base class that provides `_repr_content`, so start
        # from an empty mapping unless a cooperating parent supplies one.
        content = dict(getattr(super(), "_repr_content", {}))
        content["Path"] = str(self.path)
        return content

    def _load_data(self) -> "pd.DataFrame":
        """Read the CSV file and apply the optional NA and "id" clean-up."""
        df = pd.read_csv(self.path, index_col=False)
        # NA rows are removed first, so an NA in the "id" column also drops the row.
        if self.rmNA:
            df = df.dropna()
        if self.dropId:
            df = df.drop(columns=["id"])
        return df