第2篇:CPython实现原理:整数对象
在CPython中的整数对象的堆内存分配并非在即时对某个需要使用的整数分配内存的,因为这样势必对CPython的内存利用率非常底下。而是有一套非常高效的内存管理方案就是针对整数对象-缓冲池机制。我们知道在CPython的内存管理模型中,每个内建对象都有自己独有的对象池机制。而本篇我们恰好讲解整数对象缓存池。
首先针对单个整数PyLongObject对象的表示法,CPython3.9有明确的定义
....
/* Long integer representation.
The absolute value of a number is equal to
SUM(for i=0 through abs(ob_size)-1) ob_digit[i] * 2**(SHIFT*i)
Negative numbers are represented with ob_size < 0;
zero is represented by ob_size == 0.
In a normalized number, ob_digit[abs(ob_size)-1] (the most significant
digit) is never zero. Also, in all cases, for all valid i,
0 <= ob_digit[i] <= MASK.
The allocation function takes care of allocating extra memory
so that ob_digit[0] ... ob_digit[abs(ob_size)-1] are actually available.
CAUTION: Generic code manipulating subtypes of PyVarObject has to
aware that ints abuse ob_size's sign bit.
*/
struct _longobject {
PyObject_VAR_HEAD
digit ob_digit[1];
};
....
typedef struct _longobject PyLongObject;
其最终形式,等价如下
....
struct _longobject {
PyObject ob_base;
Py_ssize_t ob_size; /* Number of items in variable part */
digit ob_digit[1];
};
....
typedef struct _longobject PyLongObject;
我们从源代码的注释中得到一些关键的信息,整数的绝对值等于如下表达式
SUM(for i = 0 through abs(ob_size)-1)ob_digit [i] * 2 **(SHIFT * i)
如果上面的表达式,我们知道ob_size的绝对值是控制ob_digit数组的长度,而SHIFT有一个宏常量PyLong_SHIFT定义(下文会提到)。
- 负数用ob_size <0表示;
- 0由ob_size == 0表示
- 以标准化数字ob_digit [abs(ob_size)-1](最高有效数字)永远不会为0。 而且,在所有情况下,对于所有有效的i,0 <= ob_digit [i] <=掩码
- 内存分配函数负责分配额外的内存,因此ob_digit [0]到ob_digit [abs(ob_size)-1]实际上可用的有效负载部分。
综上所述,对于大整数在CPython3.x中的有效负载的存储形式如下图
而对于整数对象的存储方式,CPython3.x中就规定使用2**PyLong_SHIFT进制的字符串来表示大整数,而PyLong_SHIFT的定义在Include/longintrepr.h中找到,PyLong_SHIFT的可能的默认值是30或15,分别表示
- 把30位大小的数值存储在uint32_t类型的ob_digit数组中
- 把15位大小的数值存储在uint32_t类型的ob_digit数组中
#if PYLONG_BITS_IN_DIGIT == 30
typedef uint32_t digit;
typedef int32_t sdigit; /* signed variant of digit */
typedef uint64_t twodigits;
typedef int64_t stwodigits; /* signed variant of twodigits */
#define PyLong_SHIFT 30
#define _PyLong_DECIMAL_SHIFT 9 /* max(e such that 10**e fits in a digit) */
#define _PyLong_DECIMAL_BASE ((digit)1000000000) /* 10 ** DECIMAL_SHIFT */
#elif PYLONG_BITS_IN_DIGIT == 15
typedef unsigned short digit;
typedef short sdigit; /* signed variant of digit */
typedef unsigned long twodigits;
typedef long stwodigits; /* signed variant of twodigits */
#define PyLong_SHIFT 15
#define _PyLong_DECIMAL_SHIFT 4 /* max(e such that 10**e fits in a digit) */
#define _PyLong_DECIMAL_BASE ((digit)10000) /* 10 ** DECIMAL_SHIFT */
#else
#error "PYLONG_BITS_IN_DIGIT should be 15 or 30"
#endif
#define PyLong_BASE ((digit)1 << PyLong_SHIFT)
#define PyLong_MASK ((digit)(PyLong_BASE - 1))
#if PyLong_SHIFT % 5 != 0
#error "longobject.c requires that PyLong_SHIFT be divisible by 5"
#endif
那么究竟如何表示一个任意的大型整数呢?事实上,可以参考如下的代码
PyObject *
PyLong_FromLong(long ival)
{
PyLongObject *v;
unsigned long abs_ival;
unsigned long t; /* unsigned so >> doesn't propagate sign bit */
int ndigits = 0;
int sign;
if (IS_SMALL_INT(ival)) {
return get_small_int((sdigit)ival);
}
if (ival < 0) {
/* negate: can't write this as abs_ival = -ival since that
invokes undefined behaviour when ival is LONG_MIN */
abs_ival = 0U-(unsigned long)ival;
sign = -1;
}
else {
abs_ival = (unsigned long)ival;
sign = ival == 0 ? 0 : 1;
}
/* Fast path for single-digit ints */
if (!(abs_ival >> PyLong_SHIFT)) {
v = _PyLong_New(1);
if (v) {
Py_SET_SIZE(v, sign);
v->ob_digit[0] = Py_SAFE_DOWNCAST(
abs_ival, unsigned long, digit);
}
return (PyObject*)v;
}
#if PyLong_SHIFT==15
/* 2 digits */
if (!(abs_ival >> 2*PyLong_SHIFT)) {
v = _PyLong_New(2);
if (v) {
Py_SET_SIZE(v, 2 * sign);
v->ob_digit[0] = Py_SAFE_DOWNCAST(
abs_ival & PyLong_MASK, unsigned long, digit);
v->ob_digit[1] = Py_SAFE_DOWNCAST(
abs_ival >> PyLong_SHIFT, unsigned long, digit);
}
return (PyObject*)v;
}
#endif
/* Larger numbers: loop to determine number of digits */
t = abs_ival;
while (t) {
++ndigits;
t >>= PyLong_SHIFT;
}
v = _PyLong_New(ndigits);
if (v != NULL) {
digit *p = v->ob_digit;
Py_SET_SIZE(v, ndigits * sign);
t = abs_ival;
while (t) {
*p++ = Py_SAFE_DOWNCAST(
t & PyLong_MASK, unsigned long, digit);
t >>= PyLong_SHIFT;
}
}
return (PyObject *)v;
}
整数对象的初始化
整数对象的创建,我们在前一篇已经说过创建一个PyLongObject需要引用与该类型匹配的PyLong_Type实例,我们查看一下PyLong_Type实例的初始化代码,它位于Objects/longobject.c源文件,
PyTypeObject PyLong_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"int", /* tp_name */
offsetof(PyLongObject, ob_digit), /* tp_basicsize */
sizeof(digit), /* tp_itemsize */
....
long_to_decimal_string, /* tp_repr */
&long_as_number, /* tp_as_number */
....
(hashfunc)long_hash, /* tp_hash */
....
PyObject_GenericGetAttr, /* tp_getattro */
....
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_LONG_SUBCLASS, /* tp_flags */
long_doc, /* tp_doc */
....
long_richcompare, /* tp_richcompare */
....
long_methods, /* tp_methods */
0, /* tp_members */
long_getset, /* tp_getset */
....
long_new, /* tp_new */
PyObject_Del, /* tp_free */
};
注意PyLong对象的实例化需要调用PyLong_Type实例的tp_new字段相关的参数,它是一个long_new的函数指针,那么long_new函数的具体定义
static PyObject *
long_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
PyObject *return_value = NULL;
static const char * const _keywords[] = {"", "base", NULL};
static _PyArg_Parser _parser = {NULL, _keywords, "int", 0};
PyObject *argsbuf[2];
PyObject * const *fastargs;
Py_ssize_t nargs = PyTuple_GET_SIZE(args);
Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 0;
PyObject *x = NULL;
PyObject *obase = NULL;
fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 0, 2, 0, argsbuf);
if (!fastargs) {
goto exit;
}
if (nargs < 1) {
goto skip_optional_posonly;
}
noptargs--;
x = fastargs[0];
skip_optional_posonly:
if (!noptargs) {
goto skip_optional_pos;
}
obase = fastargs[1];
skip_optional_pos:
return_value = long_new_impl(type, x, obase);
exit:
return return_value;
}
而long_new函数其实最终是调用long_new_impl函数,其具体定义在源文件的Objects/longobject.c源文件
static PyObject *
long_new_impl(PyTypeObject *type, PyObject *x, PyObject *obase)
/*[clinic end generated code: output=e47cfe777ab0f24c input=81c98f418af9eb6f]*/
{
Py_ssize_t base;
if (type != &PyLong_Type)
return long_subtype_new(type, x, obase); /* Wimp out */
if (x == NULL) {
if (obase != NULL) {
PyErr_SetString(PyExc_TypeError,
"int() missing string argument");
return NULL;
}
return PyLong_FromLong(0L);
}
if (obase == NULL)
return PyNumber_Long(x);
base = PyNumber_AsSsize_t(obase, NULL);
if (base == -1 && PyErr_Occurred())
return NULL;
if ((base != 0 && base < 2) || base > 36) {
PyErr_SetString(PyExc_ValueError,
"int() base must be >= 2 and <= 36, or 0");
return NULL;
}
if (PyUnicode_Check(x))
return PyLong_FromUnicodeObject(x, (int)base);
else if (PyByteArray_Check(x) || PyBytes_Check(x)) {
const char *string;
if (PyByteArray_Check(x))
string = PyByteArray_AS_STRING(x);
else
string = PyBytes_AS_STRING(x);
return _PyLong_FromBytes(string, Py_SIZE(x), (int)base);
}
else {
PyErr_SetString(PyExc_TypeError,
"int() can't convert non-string with explicit base");
return NULL;
}
}
更新中