• 隐藏侧边栏
  • 展开分类目录
  • 关注微信公众号
  • 我的GitHub
  • QQ:1753970025
Chen Jiehua

C++字符串日期转时间戳 

目录

将字符串日期转换为整型时间戳,在日常开发中可以说是非常常见的。不过虽然需求简单,但其中也有很多细节值得深究学习,今天我们来看看如何通过C++实现一个Python扩展来进行一些优化。

Python

假设字符串日期格式为:"%Y-%m-%d %H:%M:%S",也即类似 "2023-02-20 17:22:30",那么只需一两行 python 代码就可以完成任务。

可以使用 time 模块:

import time

# time.mktime() treats the struct_time as local time, so the expected value
# 1676884950 (= 2023-02-20 09:22:30 UTC) only matches on a UTC+8 machine.
tp = time.strptime("2023-02-20 17:22:30", "%Y-%m-%d %H:%M:%S")
ts = int(time.mktime(tp))
print ts, ts == 1676884950

或者使用 datetime 模块:

from datetime import datetime

dt = datetime.strptime("2023-02-20 17:22:30", "%Y-%m-%d %H:%M:%S")
ts = int(time.mktime(dt.timetuple()))
print ts, ts == 1676884950

而实际上,time.strptime 和 datetime.strptime 最终都是调用到了 _strptime.py 中的 _strptime(),其中内部使用正则进行日期格式匹配,最后解析出年月日时分秒等信息。具体可以查看源码:

/*
 * Implementation behind time.strptime(): it does no parsing itself.  It
 * imports the `_strptime` module, looks up its `_strptime_time` attribute
 * and calls it with the caller's argument tuple unchanged.
 * Returns the callee's result, or NULL if the import, the attribute lookup
 * or the call fails.
 */
static PyObject *
time_strptime(PyObject *self, PyObject *args)
{
    PyObject *module, *func, *result;
    _Py_IDENTIFIER(_strptime_time);

    module = PyImport_ImportModuleNoBlock("_strptime");
    if (!module)
        return NULL;

    func = _PyObject_GetAttrId(module, &PyId__strptime_time);
    /* The module reference is released right after the lookup, whether or
       not the lookup succeeded. */
    Py_DECREF(module);
    if (!func) {
        return NULL;
    }

    /* Forward the original positional arguments; no keyword arguments. */
    result = PyObject_Call(func, args, NULL);
    Py_DECREF(func);
    return result;
}

在高频调用的情况下,python处理和正则匹配会导致转换性能并不是很高,所以我们决定尝试使用C++来提升一下日期转换的性能。

C++

使用 C++ 可以非常便捷地开发一个新的 Python 模块,并且编译后可以直接在 Python 脚本中导入调用,详细步骤可以参考:使用C/C++扩展Python

Boost regex

假设字符串日期格式为 "yyyy-mm-dd HH:MM:SS",我们可以使用正则匹配:

#include <boost/regex.hpp>

boost::regex pattern(R"((\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}))");

/**
 * fasttime.str2ts_re(s) -> int
 *
 * Matches `s` against the precompiled boost::regex `pattern`
 * ("yyyy-mm-dd HH:MM:SS") and converts the captured fields to a Unix
 * timestamp with mktime(), i.e. the fields are read as local time.
 *
 * Returns a new Python int, or nullptr with an exception set:
 *   - whatever PyArg_ParseTuple raised when the argument is not a string;
 *   - ValueError when the string does not match the expected layout or a
 *     C++ exception is thrown while matching/converting.
 */
static PyObject *str2ts_re(PyObject *self, PyObject *args) {
    const char *s;
    if (!PyArg_ParseTuple(args, "s", &s)) {
        return nullptr;  // exception already set by PyArg_ParseTuple
    }
    try {
        boost::cmatch m;
        if (!boost::regex_match(s, m, pattern) || m.size() != 7) {
            // Returning nullptr with no exception set makes the interpreter
            // raise an opaque SystemError, so report the real problem.
            PyErr_SetString(PyExc_ValueError,
                            "time data does not match format 'yyyy-mm-dd HH:MM:SS'");
            return nullptr;
        }
        // Assign by field name instead of relying on the member order of tm.
        // Evaluation order (sec first, year last) is the same as before.
        std::tm t{};
        t.tm_sec = std::stoi(m[6].str());
        t.tm_min = std::stoi(m[5].str());
        t.tm_hour = std::stoi(m[4].str());
        t.tm_mday = std::stoi(m[3].str());
        t.tm_mon = std::stoi(m[2].str()) - 1;      // tm_mon is 0-based
        t.tm_year = std::stoi(m[1].str()) - 1900;  // tm_year counts from 1900
        // -1 lets mktime() decide whether DST applies, which is what Python's
        // time.mktime(time.strptime(...)) does; forcing 0 is one hour off
        // for summer dates in time zones that observe DST.
        t.tm_isdst = -1;
        return PyInt_FromLong(mktime(&t));
    }
    catch (const std::exception &err) {
        // Never let a C++ exception propagate into the interpreter.
        PyErr_SetString(PyExc_ValueError, err.what());
        return nullptr;
    }
}

编译的时候需要把 boost_regex 也链接上:

target_link_libraries(${PROJECT_NAME} boost_regex)

按位匹配

也可以直接按照日期格式直接取出年月日时分秒:

/**
 * fasttime.str2ts_bit(s) -> int
 *
 * Reads the date/time fields of a "yyyy-mm-dd HH:MM:SS" string by fixed
 * column offsets (no regex) and converts them to a Unix timestamp with
 * mktime(), i.e. the fields are read as local time. Separator characters
 * are not inspected.
 *
 * Returns a new Python int, or nullptr with an exception set:
 *   - whatever PyArg_ParseTuple raised when the argument is not a string;
 *   - TypeError carrying the std::exception message when the string is too
 *     short or a field is not numeric.
 */
static PyObject *str2ts_bit(PyObject *self, PyObject *args) {
    const char *s;
    if (!PyArg_ParseTuple(args, "s", &s)) {
        return nullptr;  // exception already set by PyArg_ParseTuple
    }
    try {
        const std::string ss = s;
        // Assign by field name instead of relying on the member order of tm.
        // Seconds are read first, so a too-short string fails immediately
        // (substr throws std::out_of_range, stoi throws on an empty field).
        std::tm t{};
        t.tm_sec = std::stoi(ss.substr(17, 2));
        t.tm_min = std::stoi(ss.substr(14, 2));
        t.tm_hour = std::stoi(ss.substr(11, 2));
        t.tm_mday = std::stoi(ss.substr(8, 2));
        t.tm_mon = std::stoi(ss.substr(5, 2)) - 1;      // tm_mon is 0-based
        t.tm_year = std::stoi(ss.substr(0, 4)) - 1900;  // tm_year counts from 1900
        // -1 lets mktime() decide whether DST applies, which is what Python's
        // time.mktime(time.strptime(...)) does; forcing 0 is one hour off
        // for summer dates in time zones that observe DST.
        t.tm_isdst = -1;
        return PyInt_FromLong(mktime(&t));
    }
    catch (const std::exception &err) {
        PyErr_SetString(PyExc_TypeError, err.what());
        return nullptr;
    }
}

编译so

Python方法定义和初始化:

// Method table of the module. Each entry is: Python-visible name, C++
// implementation, calling-convention flags, docstring (left empty here).
static PyMethodDef FastTimeMethods[] = {
        {"str2ts_re", str2ts_re, METH_VARARGS, ""},
        {"str2ts_bit", str2ts_bit, METH_VARARGS, ""},
        {nullptr, nullptr, 0, nullptr},  // all-null sentinel ends the table
};

// Module initializer (Python 2 style: init<module name> + Py_InitModule).
// Registers the functions above under the module name "fasttime".
PyMODINIT_FUNC initfasttime(void) {
    Py_InitModule("fasttime", FastTimeMethods);
}

CMakeLists.txt:

cmake_minimum_required(VERSION 3.11)
project(fasttime)

# Locally built Boost tree. These absolute paths are machine-specific and
# must be adjusted to wherever Boost (with a built boost_regex) lives.
set(BOOST_ROOT /home/ubuntu/boost_1_68_0)
set(BOOST_LIB /home/ubuntu/boost_1_68_0/stage/lib)

include_directories(${BOOST_ROOT})
link_directories(${BOOST_LIB})

# The extension code uses nullptr and a raw string literal, so C++11 is needed.
set(CMAKE_CXX_STANDARD 11)
# Python 2.7 headers: the code relies on the Python 2 C API
# (Py_InitModule, PyInt_FromLong).
find_package(PythonLibs 2.7 REQUIRED)
message(STATUS "Python Include=${PYTHON_INCLUDE_DIRS}")
include_directories(${PYTHON_INCLUDE_DIRS})

add_library(${PROJECT_NAME} SHARED library.cpp)
target_link_libraries(${PROJECT_NAME} boost_regex)
# Drop the default "lib" prefix so the output file is named after the module
# (fasttime), which is the name `import fasttime` looks for.
set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "")

性能对比

编译之后就可以直接在 python 脚本中调用了,我们对比一下性能。

先随机生成日期,然后统计运行耗时:

def timeit(func):
    """Decorator that prints the wall-clock duration of each call to ``func``.

    The printed line has the form ``func: <name>, timeit: <seconds>``.
    The wrapped function's return value is passed through to the caller
    (previously it was silently dropped).
    """
    def wrap(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("func: %s, timeit: %.6f" % (func.__name__, t2 - t1))
        return result
    return wrap

def generate_sample(n):
    """Return a list of ``n`` random "YYYY-mm-dd HH:MM:SS" strings.

    Years are 2020..2024, months 1..12, hours 0..23, minutes and seconds
    0..59. Days are limited to 1..28 so every generated date is valid in
    every month, including February.
    """
    result = []
    for _ in range(n):
        result.append("%04d-%02d-%02d %02d:%02d:%02d" % (
            random.randint(2020, 2024),
            random.randint(1, 12),
            random.randint(1, 28),
            random.randint(0, 23),
            # randint is inclusive on both ends; the former range(59) never
            # produced minute/second 59.
            random.randint(0, 59),
            random.randint(0, 59),
        ))
    return result

三种实现方式比较:

import fasttime


@timeit
def convert_by_python(samples):
    """Baseline: parse and convert every sample with the time module."""
    fmt = "%Y-%m-%d %H:%M:%S"
    for text in samples:
        int(time.mktime(time.strptime(text, fmt)))


@timeit
def convert_by_cpp_re(samples):
    """Convert every sample with the regex-based extension function."""
    for text in samples:
        fasttime.str2ts_re(text)


@timeit
def convert_by_cpp_bit(samples):
    """Convert every sample with the fixed-offset extension function."""
    for text in samples:
        fasttime.str2ts_bit(text)

测试结果:

benchmark N = 100000
func: convert_by_python, timeit: 0.873704
func: convert_by_cpp_re, timeit: 0.290680
func: convert_by_cpp_bit, timeit: 0.214419

有点出乎意料,C++实现的性能竟然只提升了这么一点?

mktime

简单分析了一下,发现主要耗时都在C++ 的 mktime 函数调用上了,看一下 mktime.c 的源码:

/* Convert *TP to a __time64_t value.
   Note that __tzset () is called unconditionally at the top, i.e. on every
   single call, before any conversion work is done.  */
__time64_t
__mktime64 (struct tm *tp)
{
  /* POSIX.1 8.1.1 requires that whenever mktime() is called, the
     time zone names contained in the external variable 'tzname' shall
     be set as if the tzset() function had been called.  */
  __tzset ();
# if defined _LIBC || NEED_MKTIME_WORKING
  /* Static, so the offset value is kept between calls.  */
  static mktime_offset_t localtime_offset;
  return __mktime_internal (tp, __localtime64_r, &localtime_offset);
# else
#  undef mktime
  return mktime (tp);
# endif
}
#endif /* _LIBC || NEED_MKTIME_WORKING || NEED_MKTIME_WINDOWS */
#if defined _LIBC && __TIMESIZE != 64
libc_hidden_def (__mktime64)
/* time_t wrapper around __mktime64.  It works on a copy of *TP, so the
   caller's struct is only overwritten when the result fits in time_t;
   otherwise errno is set to EOVERFLOW, -1 is returned and *TP is left
   untouched.  */
time_t
mktime (struct tm *tp)
{
  struct tm tm = *tp;
  __time64_t t = __mktime64 (&tm);
  if (in_time_t_range (t))
    {
      *tp = tm;
      return t;
    }
  else
    {
      __set_errno (EOVERFLOW);
      return -1;
    }
}

可以看到每次调用前都需要进行时区设置 __tzset(),再看一下 tzset.c 的源码:

/* Interpret the TZ envariable.
   ALWAYS nonzero disables the "already initialized" shortcut below, so
   the TZ environment variable is re-read on that call.  */
static void
 tzset_internal (int always)
{
  static int is_initialized;
  const char *tz;
  if (is_initialized && !always)
    return;
  is_initialized = 1;
  /* Examine the TZ environment variable.  */
  tz = getenv ("TZ");
  if (tz && *tz == '\0')
    /* User specified the empty string; use UTC explicitly.  */
    tz = "Universal";
  /* A leading colon means "implementation defined syntax".
     We ignore the colon and always use the same algorithm:
     try a data file, and if none exists parse the 1003.1 syntax.  */
  if (tz && *tz == ':')
    ++tz;
  /* Check whether the value changed since the last run.
     Note the `tz != NULL' condition: when TZ is not set in the
     environment this shortcut is never taken, and execution continues
     to the free/__strdup and __tzfile_read () calls below.  */
  if (old_tz != NULL && tz != NULL && strcmp (tz, old_tz) == 0)
    /* No change, simply return.  */
    return;
  if (tz == NULL)
    /* No user specification; use the site-wide default.  */
    tz = TZDEFAULT;
  tz_rules[0].name = NULL;
  tz_rules[1].name = NULL;
  /* Save the value of `tz'.  */
  free (old_tz);
  old_tz = tz ? __strdup (tz) : NULL;
  /* Try to read a data file.  */
  __tzfile_read (tz, 0, NULL);
  if (__use_tzfile)
    return;
  /* No data file found.  Default to UTC if nothing specified.  */
  if (tz == NULL || *tz == '\0'
      || (TZDEFAULT != NULL && strcmp (tz, TZDEFAULT) == 0))
    {
      memset (tz_rules, '\0', sizeof tz_rules);
      tz_rules[0].name = tz_rules[1].name = "UTC";
      if (J0 != 0)
	tz_rules[0].type = tz_rules[1].type = J0;
      tz_rules[0].change = tz_rules[1].change = -1;
      update_vars ();
      return;
    }
  /* Last resort: parse TZ itself as a rule string.  */
  __tzset_parse_tz (tz);
}

/* Re-evaluate the time zone settings.  The whole body runs with
   tzset_lock held, and tzset_internal is called with ALWAYS = 1, so the
   TZ environment variable is examined again on every call.  */
void
 __tzset (void)
{
  __libc_lock_lock (tzset_lock);
  tzset_internal (1);
  if (!__use_tzfile)
    {
      /* Set `tzname'.  */
      __tzname[0] = (char *) tz_rules[0].name;
      __tzname[1] = (char *) tz_rules[1].name;
    }
  __libc_lock_unlock (tzset_lock);
}

可以看到 __tzset() 用到了锁,之后它会读取环境变量TZ的值以及读取文件 __tzfile_read()。

如果我们提前设置好环境变量 TZ ,性能应该有所提升:

import os
os.environ["TZ"] = "Asia/Shanghai"

再次进行测试,确实比之前有所提升:

benchmark N = 100000
func: convert_by_python, timeit: 0.683977
func: convert_by_cpp_re, timeit: 0.151032
func: convert_by_cpp_bit, timeit: 0.066398

Gauss算法

除了调用 mktime() 接口来计算时间戳,那有没有更快速的方法呢?在 Linux 的源码中可以找到一个更精妙的算法:

/*
 * mktime64 - Converts date to seconds.
 * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
 *
 * [For the Julian calendar (which was used in Russia before 1917,
 * Britain & colonies before 1752, anywhere else before 1582,
 * and is still in use by some communities) leave out the
 * -year/100+year/400 terms, and add 10.]
 *
 * This algorithm was first published by Gauss (I think).
 *
 * A leap second can be indicated by calling this function with sec as
 * 60 (allowable under ISO 8601).  The leap second is treated the same
 * as the following second since they don't exist in UNIX time.
 *
 * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
 * tomorrow - (allowable under ISO 8601) is supported.
 */
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
		const unsigned int day, const unsigned int hour,
		const unsigned int min, const unsigned int sec)
{
	unsigned int mon = mon0, year = year0;
	/* 1..12 -> 11,12,1..10 */
	if (0 >= (int) (mon -= 2)) {
		mon += 12;	/* Puts Feb last since it has leap day */
		year -= 1;
	}
	/*
	 * The cast to time64_t is applied to the day count before the
	 * multiplications, so the *24*60*60 scaling is done in time64_t.
	 * 719499 is the value the day expression takes for 1970-01-01
	 * (after the shift above: year=1969, mon=11, day=1 gives
	 * 477 + 336 + 1 + 718685), so that date maps to day 0.
	 */
	return ((((time64_t)
		  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
		  year*365 - 719499
	    )*24 + hour /* now have hours - midnight tomorrow handled here */
	  )*60 + min /* now have minutes */
	)*60 + sec; /* finally seconds */
}

短短几行代码就能搞定,只能感叹大佬NB!

使用新的算法将 mktime 替换掉,由于不再使用系统的时区,我们需要在接口提供时区设置参数:

/**
 * fasttime.str2ts_gauss(s, offset=0) -> int
 *
 * Reads the fields of a "yyyy-mm-dd HH:MM:SS" string by fixed column
 * offsets and computes the Unix timestamp arithmetically (same formula as
 * the Linux kernel's mktime64), without calling mktime().
 *
 * `offset` is the UTC offset of the input in whole hours (e.g. 8 for
 * UTC+8); offset * 3600 seconds are subtracted from the result. With the
 * default 0 the input is treated as UTC.
 *
 * Returns a new Python int, or nullptr with an exception set:
 *   - whatever PyArg_ParseTupleAndKeywords raised for bad arguments;
 *   - TypeError carrying the std::exception message when the string is too
 *     short or a field is not numeric.
 */
static PyObject *str2ts_gauss(PyObject *self, PyObject *args, PyObject *kwargs) {
    const char *s;
    int offset = 0;
    // String literals are const in C++11, so keep the table const and cast
    // only at the call site, where the C API asks for a plain char **.
    static const char *kwlist[] = {"s", "offset", nullptr};
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|i",
                                     const_cast<char **>(kwlist), &s, &offset)) {
        return nullptr;  // exception already set by the argument parser
    }
    try {
        const std::string ss(s);
        int year = std::stoi(ss.substr(0, 4));
        int mon = std::stoi(ss.substr(5, 2));
        const int day = std::stoi(ss.substr(8, 2));
        const int hour = std::stoi(ss.substr(11, 2));
        const int min = std::stoi(ss.substr(14, 2));
        const int sec = std::stoi(ss.substr(17, 2));
        if (0 >= (mon -= 2)) {  /* 1..12 -> 11,12,1..10 */
            mon += 12;  /* Puts Feb last since it has leap day */
            year -= 1;
        }
        const int leap = year / 4 - year / 100 + year / 400;
        // Widen BEFORE scaling to seconds: in plain int the product
        // overflows for dates after 2038-01-19.
        const long long days =
                static_cast<long long>(leap) + day + 367 * mon / 12 + year * 365LL - 719499;
        const long long t = ((days * 24 + hour) * 60 + min) * 60 + sec;
        // PyInt_FromLong takes a long, as the original code passed.
        return PyInt_FromLong(static_cast<long>(t - offset * 3600LL));
    }
    catch (const std::exception &err) {
        PyErr_SetString(PyExc_TypeError, err.what());
        return nullptr;
    }
}

static PyMethodDef FastTimeMethods[] = {
        {"str2ts_gauss", (PyCFunction)str2ts_gauss, METH_VARARGS|METH_KEYWORDS, ""},
        ...
};

由于我们是北京东八区(UTC+8),调用时需要传入 offset=8,接口内部会减去 8 个小时换算成 UTC 时间戳:

@timeit
def convert_by_cpp_gauss(samples):
    """Convert every sample with the arithmetic (Gauss) extension function,
    treating the inputs as UTC+8 local times."""
    for text in samples:
        fasttime.str2ts_gauss(text, offset=8)

再次对比一下性能:

benchmark N = 100000
func: convert_by_python, timeit: 0.900021
func: convert_by_cpp_re, timeit: 0.296217
func: convert_by_cpp_bit, timeit: 0.216649
func: convert_by_cpp_gauss, timeit: 0.051596

可以看到,若没有设置时区环境变量,这个算法的耗时只要 mktime 的四分之一,如果有设置则基本持平;而相比 python 实现,效率更是提升十几倍。

总结

虽然只是一个小小的需求,不过当深入去了解的时候,却能在源码中发现很多新的知识!

参考:

码字很辛苦,转载请注明来自ChenJiehua《C++字符串日期转时间戳》

评论