3.6. Array Import

3.6. Array Import — Python

3.6.1. SetUp

>>> import numpy as np

3.6.2. np.loadtxt()

>>> DATA = 'https://python3.info/_static/iris.csv'

>>> a = np.loadtxt(DATA)
Traceback (most recent call last):
ValueError: could not convert string 'sepal_length,sepal_width,petal_length,petal_width,species' to float64 at row 0, column 1.

>>> a = np.loadtxt(DATA, skiprows=1)
Traceback (most recent call last):
ValueError: could not convert string '5.4,3.9,1.3,0.4,setosa' to float64 at row 0, column 1.

>>> a = np.loadtxt(DATA, skiprows=1, delimiter=',')
Traceback (most recent call last):
ValueError: could not convert string 'setosa' to float64 at row 0, column 5.

>>> a = np.loadtxt(DATA, skiprows=1, delimiter=',', max_rows=5, usecols=(0,1,2,3))
>>> a
array([[5.4, 3.9, 1.3, 0.4],
       [5.9, 3. , 5.1, 1.8],
       [6. , 3.4, 4.5, 1.6],
       [7.3, 2.9, 6.3, 1.8],
       [5.6, 2.5, 3.9, 1.1]])

>>> header = np.loadtxt(DATA, max_rows=1, delimiter=',', dtype=str, usecols=(0,1,2,3))
>>> data = np.loadtxt(DATA, skiprows=1, max_rows=3, delimiter=',', usecols=(0,1,2,3))
>>>
>>> header
array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='<U12')
>>>
>>> data
array([[5.4, 3.9, 1.3, 0.4],
       [5.9, 3. , 5.1, 1.8],
       [6. , 3.4, 4.5, 1.6]])

3.6.3. Other

Table 3.5. NumPy Import methods
Method	Data Type	Description
`np.loadtxt()`	Text	Load data from text file such as `.csv`
`np.load()`	Binary	Load data from `.npy` file
`np.loads()`	Binary	Load binary data from `pickle` string (deprecated since NumPy 1.15 and removed in later releases; use `pickle.loads()` instead)
`np.fromstring()`	Text	Load data from string
`np.fromregex()`	Text	Load data from file using regex to parse
`np.genfromtxt()`	Text	Load data with missing values handled as specified
`scipy.io.loadmat()`	Binary	Load data from MATLAB `.mat` file (provided by SciPy, not NumPy)

>>> #
... data = np.loadtxt('/tmp/myfile.csv', delimiter=',', usecols=1, skiprows=1, dtype=np.float16)
...
... small = (data < 1)
... medium = (data < 1) & (data < 2.0)
... large = (data < 2)
...
... np.save('/tmp/small', data[small])
... np.save('/tmp/medium', data[medium])
... np.save('/tmp/large', data[large])

3.6.4. Use Case - 1

>>> header = np.loadtxt(DATA, max_rows=1, dtype='str', delimiter=',', usecols=(0,1,2,3))
>>> values = np.loadtxt(DATA, skiprows=1, dtype='float', delimiter=',', usecols=(0,1,2,3))
>>> species = np.loadtxt(DATA, skiprows=1, dtype='str', delimiter=',', usecols=4)
>>>
>>> sepal_length = (header == 'sepal_length')
>>> sepal_width = (header == 'sepal_width')
>>> petal_length = (header == 'petal_length')
>>> petal_width = (header == 'petal_width')
>>>
>>> setosa = (species == 'setosa')
>>> versicolor = (species == 'versicolor')
>>> virginica = (species == 'virginica')

Then you can query your data using previously defined identifiers (queries):

>>> values[setosa, sepal_length]
array([5.4, 5.4, 4.9, 5.1, 4.6, 5.2, 5.2, 5.1, 4.8, 4.9, 4.3, 5. , 5.4,
       5.1, 4.8, 4.8, 4.4, 5.1, 4.6, 5.5, 5. , 5.7, 5.4, 4.8, 5. , 5.1,
       4.9, 5. , 4.6, 4.9, 5.1, 4.7, 5.7, 4.4, 5.4, 4.5, 5. , 5.3, 5.1,
       5. , 5.8, 5.2, 4.6, 4.8, 4.4, 5.4, 5. , 4.7, 5.1, 5.5, 5. ])

>>> values[setosa, sepal_length].mean()
np.float64(5.013725490196078)

>>> values[setosa, sepal_length].mean().round(2)
np.float64(5.01)

3.6.5. Assignments

# %% About
# - Name: Numpy Loadtext
# - Difficulty: easy
# - Lines: 4
# - Minutes: 5

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Load text from `DATA`
# 2. Define variables:
#    - `species: np.ndarray[str]` - first row, columns 2, 3, 4
#    - `features: np.ndarray[float]` - all rows except the first one, columns 0, 1, 2, 3
#    - `labels: np.ndarray[int]` - all rows except the first one, column 4
# 3. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj tekst z `DATA`
# 2. Zdefiniuj zmienne:
#    - `species: np.ndarray[str]` - pierwszy wiersz, kolumny 2, 3, 4
#    - `features: np.ndarray[float]` - wszystkie wiersze poza pierwszym, kolumny 0, 1, 2, 3
#    - `labels: np.ndarray[int]` - wszystkie wiersze poza pierwszym, kolumna 4
# 3. Uruchom doctesty - wszystkie muszą się powieść

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'

>>> assert species is not Ellipsis, \
'Variable `species` has an invalid value; assign result of your program to it.'

>>> assert labels is not Ellipsis, \
'Variable `labels` has an invalid value; assign result of your program to it.'

>>> assert features is not Ellipsis, \
'Variable `features` has an invalid value; assign result of your program to it.'

>>> assert type(species) is np.ndarray, \
'Variable `species` has an invalid type; expected: `np.ndarray`.'

>>> assert type(features) is np.ndarray, \
'Variable `features` has an invalid type; expected: `np.ndarray`.'

>>> assert type(labels) is np.ndarray, \
'Variable `labels` has an invalid type; expected: `np.ndarray`.'

>>> assert species.dtype == np.dtype('<U10'), \
'Variable `species` has an invalid type; expected: `str`.'

>>> assert features.dtype is np.dtype('float64'), \
'Variable `features` has an invalid type; expected: `float`.'

>>> assert labels.dtype is np.dtype('int64'), \
'Variable `labels` has an invalid type; expected: `int`.'

>>> assert len(species) == 3, \
'Variable `species` has an invalid length; expected: `3`.'

>>> assert len(features) == 151, \
'Variable `features` has an invalid length; expected: `151`.'

>>> assert len(labels) == 151, \
'Variable `labels` has an invalid length; expected: `151`.'

>>> species
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

>>> features[:3]
array([[5.4, 3.9, 1.3, 0.4],
       [5.9, 3. , 5.1, 1.8],
       [6. , 3.4, 4.5, 1.6]])

>>> features[-3:]
array([[4.9, 2.5, 4.5, 1.7],
       [6.3, 2.8, 5.1, 1.5],
       [6.8, 3.2, 5.9, 2.3]])

>>> labels
array([0, 2, 1, 2, 1, 0, 1, 1, 0, 2, 2, 0, 0, 2, 2, 1, 2, 2, 2, 1, 0, 1,
       1, 0, 0, 0, 2, 2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2, 1, 1, 1, 2, 2,
       0, 1, 1, 1, 1, 1, 2, 0, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 0, 0,
       0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 1, 2, 2, 1, 0, 2, 1, 0, 1, 0, 2, 1,
       0, 2, 0, 2, 1, 0, 2, 1, 1, 0, 0, 1, 2, 2, 2, 1, 0, 1, 1, 1, 2, 2,
       0, 2, 2, 0, 2, 1, 2, 0, 0, 1, 0, 2, 0, 2, 1, 2, 2, 2, 1, 0, 2, 1,
       0, 0, 2, 0, 2, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 0, 2, 2, 2])
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import numpy as np

# %% Types
species: np.ndarray
features: np.ndarray
labels: np.ndarray

# %% Data
DATA = 'https://python3.info/_static/iris-dirty.csv'

# %% Result
species = ...
features = ...
labels = ...