第8章 字符和字符串

字符类型

基本字符类型

类型	大小（字节）	范围
char	1	-128 到 127 或 0 到 255（取决于实现）
signed char	1	-128 到 127
unsigned char	1	0 到 255
wchar_t	2 或 4	取决于实现
char16_t	2	0 到 65535（C++11+）
char32_t	4	0 到 4294967295（C++11+）
char8_t	1	0 到 255（C++20+，用于UTF-8编码）

字符常量

// 普通字符常量
char c1 = 'A';
char c2 = '\n'; // 换行符
char c3 = '\\'; // 反斜杠
char c4 = '\''; // 单引号
char c5 = '"'; // 双引号
char c6 = '\0'; // 空字符

// 宽字符常量
wchar_t wc1 = L'A';
wchar_t wc2 = L'中';

// UTF-16字符（C++11+）
char16_t c16 = u'A';

// UTF-32字符（C++11+）
char32_t c32 = U'A';

转义序列

转义序列	描述
\n	换行符
\t	制表符
\r	回车符
\\	反斜杠
\'	单引号
\"	双引号
\0	空字符
\a	响铃符
\b	退格符
\f	换页符
\v	垂直制表符
\xhh	十六进制转义序列
\ooo	八进制转义序列

C风格字符串

字符串字面量

// 字符串字面量
const char* str1 = "Hello, world!";
const char str2[] = "Hello, world!";

// 多行字符串
const char* str3 = "Line 1\n" 
                   "Line 2\n" 
                   "Line 3";

// 原始字符串（C++11+）
const char* str4 = R"(Raw string with \n and ")";

字符串操作函数

C标准库提供了一系列字符串操作函数，声明在<cstring>头文件中：

#include <cstring>

// 字符串长度
const char* str = "Hello";
size_t length = strlen(str); // 返回5

// 字符串复制
char dest[20];
strcpy(dest, "Hello"); // 复制字符串

// 安全的字符串复制
strncpy(dest, "Hello", sizeof(dest) - 1);
dest[sizeof(dest) - 1] = '\0'; // 确保 null 终止

// 字符串连接
strcat(dest, " world"); // 连接字符串

// 安全的字符串连接
strncat(dest, " world", sizeof(dest) - strlen(dest) - 1);

// 字符串比较
int result = strcmp("Hello", "World"); // 返回负数
result = strcmp("Hello", "Hello"); // 返回0
result = strcmp("World", "Hello"); // 返回正数

// 安全的字符串比较
result = strncmp("Hello", "World", 3); // 比较前3个字符

// 字符串搜索
const char* found = strchr("Hello", 'l'); // 查找第一个 'l'
found = strrchr("Hello", 'l'); // 查找最后一个 'l'
found = strstr("Hello world", "world"); // 查找子字符串

字符串输入和输出

// 字符串输入
char name[50];
std::cout << "Enter your name: ";
std::cin >> name; // 读取到空格为止
std::cout << "Your name is: " << name << std::endl;

// 读取整行
std::cout << "Enter a line: ";
std::cin.ignore(); // 忽略之前的换行符
std::cin.getline(name, sizeof(name)); // 读取整行
std::cout << "You entered: " << name << std::endl;

// 字符串输出
std::cout << "Hello, " << name << "!" << std::endl;

string 类（C++标准库）

基本用法

#include <string>

// 字符串初始化
std::string s1; // 空字符串
std::string s2 = "Hello";
std::string s3("Hello");
std::string s4(5, 'a'); // "aaaaa"
std::string s5(s2); // 复制构造
std::string s6(s2, 1, 3); // 从索引1开始，长度3："ell"
std::string s7({ 'H', 'e', 'l', 'l', 'o' }); // 初始化列表

// 字符串赋值
s1 = "World";
s1 = s2;
s1.assign("Hello");
s1.assign("Hello", 2, 3); // 从索引2开始，长度3："llo"
s1.assign(5, 'x'); // "xxxxx"

// 字符串拼接
s1 = "Hello";
s1 += " world";
s1.append("!");
s1.append("abc", 2); // 添加前2个字符："ab"
s1.append(3, '?'); // 添加3个'?'

// 字符串访问
char c = s1[0]; // 下标访问，无边界检查
c = s1.at(0); // 带边界检查的访问

// 字符串长度
size_t len = s1.length();
len = s1.size();
bool empty = s1.empty();

// 字符串比较
int result = s1.compare(s2); // 类似于strcmp
result = s1.compare(0, 2, s2, 0, 2); // 比较子字符串

// 字符串搜索
size_t pos = s1.find('l'); // 查找第一个 'l'
pos = s1.find("world"); // 查找子字符串
pos = s1.find('l', 2); // 从索引2开始查找
pos = s1.rfind('l'); // 查找最后一个 'l'
pos = s1.find_first_of("aeiou"); // 查找第一个元音字母
pos = s1.find_last_of("aeiou"); // 查找最后一个元音字母
pos = s1.find_first_not_of("abc"); // 查找第一个不是abc的字符

// 字符串子串
std::string sub = s1.substr(1, 3); // 从索引1开始，长度3

// 字符串修改
s1.insert(5, " "); // 在索引5处插入空格
s1.erase(5, 1); // 从索引5开始删除1个字符
s1.replace(0, 5, "Hi"); // 替换子字符串

// 字符串转换
int i = std::stoi("123"); // 转换为int
long l = std::stol("123456"); // 转换为long
double d = std::stod("3.14"); // 转换为double
std::string s = std::to_string(123); // 转换为string

// 字符串交换
std::string a = "Hello", b = "World";
a.swap(b); // 交换a和b

string 类的输入和输出

// 字符串输入
std::string name;
std::cout << "Enter your name: ";
std::cin >> name; // 读取到空格为止
std::cout << "Your name is: " << name << std::endl;

// 读取整行
std::cout << "Enter a line: ";
std::cin.ignore(); // 忽略之前的换行符
std::getline(std::cin, name); // 读取整行
std::cout << "You entered: " << name << std::endl;

// 字符串输出
std::cout << "Hello, " << name << "!" << std::endl;

字符串流

字符串输入流（istringstream）

#include <sstream>

std::string data = "123 45.67 Hello";
std::istringstream iss(data);

int i;
double d;
std::string s;

iss >> i >> d >> s; // 从字符串流中读取数据
std::cout << "i: " << i << ", d: " << d << ", s: " << s << std::endl;

字符串输出流（ostringstream）

#include <sstream>

std::ostringstream oss;
int i = 123;
double d = 45.67;
std::string s = "Hello";

oss << "i: " << i << ", d: " << d << ", s: " << s;
std::string result = oss.str(); // 获取生成的字符串
std::cout << result << std::endl;

字符串流的应用

// 数字转字符串
int number = 123;
std::ostringstream oss;
oss << number;
std::string numberStr = oss.str();

// 字符串转数字
std::string numberStr = "123";
std::istringstream iss(numberStr);
int number;
iss >> number;

// 格式化输出（std::setprecision 需要 #include <iomanip>）
std::ostringstream oss;
oss << std::fixed << std::setprecision(2);
oss << "Pi is approximately " << 3.14159;
std::string message = oss.str();

宽字符串

宽字符和宽字符串

// 宽字符
wchar_t wc = L'A';

// 宽字符串字面量
const wchar_t* wstr = L"Hello, world!";

// 宽字符串输入输出
std::wcout << L"Enter your name: ";
std::wstring wname;
std::wcin >> wname;
std::wcout << L"Hello, " << wname << L"!" << std::endl;

wstring 类

// wstring 类
std::wstring ws1 = L"Hello";
std::wstring ws2(5, L'a');

// 基本操作与 string 类似
ws1 += L" world";
size_t len = ws1.length();
std::wcout << ws1 << std::endl;

Unicode 字符串

UTF-8 字符串

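// UTF-8 字符串字面量（C++11 ~ C++17；注意：C++20 起 u8"..." 的类型为 const char8_t[]，不能再直接初始化 const char* 或 std::string，应改用 const char8_t* / std::u8string）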
const char* utf8Str = u8"Hello, 世界!";

// UTF-8 字符串
std::string utf8String = u8"Hello, 世界!";

// 输出 UTF-8 字符串
std::cout << utf8String << std::endl;

UTF-16 字符串

// UTF-16 字符串字面量（C++11+）
const char16_t* utf16Str = u"Hello, 世界!";

// UTF-16 字符串
std::u16string utf16String = u"Hello, 世界!";

UTF-32 字符串

// UTF-32 字符串字面量（C++11+）
const char32_t* utf32Str = U"Hello, 世界!";

// UTF-32 字符串
std::u32string utf32String = U"Hello, 世界!";

字符串的最佳实践

1. 优先使用 std::string

安全性：std::string 自动管理内存，避免缓冲区溢出
便捷性：std::string 提供了丰富的成员函数
可读性：std::string 的代码更易读、易维护
兼容性：std::string 可以与 C 风格字符串互操作

2. 避免缓冲区溢出

// 错误：可能导致缓冲区溢出
char buffer[10];
std::cin >> buffer; // 如果输入超过9个字符，会导致缓冲区溢出

// 正确：使用 std::string
std::string buffer;
std::cin >> buffer; // 自动处理任意长度的输入

3. 字符串连接

// 高效：使用 += / append 在原字符串上直接追加（必要时可先用 reserve 预留容量）
std::string result;
result = "Hello";
result += " ";
result += "world";
result += "!";

// 简洁：使用单个表达式（每个 operator+ 可能产生临时对象，通常并不比 += 更快）
std::string result = "Hello" + std::string(" ") + "world" + "!";

// 适合混合多种类型的格式化拼接：使用 ostringstream（流有额外开销，通常比 += 慢）
std::ostringstream oss;
oss << "Hello" << " " << "world" << "!";
std::string result = oss.str();

4. 字符串比较

// 错误：使用 == 比较 C 风格字符串
const char* str1 = "Hello";
const char* str2 = "Hello";
if (str1 == str2) { // 比较的是指针，不是内容
    // 可能不执行
}

// 正确：使用 strcmp 比较 C 风格字符串
if (strcmp(str1, str2) == 0) {
    // 执行
}

// 正确：使用 == 比较 std::string
std::string s1 = "Hello";
std::string s2 = "Hello";
if (s1 == s2) { // 比较的是内容
    // 执行
}

5. 字符串转换

// 数字转字符串
int number = 123;

// C++11+
std::string str = std::to_string(number);

// 旧版本 C++
std::ostringstream oss;
oss << number;
std::string str = oss.str();

// 字符串转数字
std::string str = "123";

// C++11+
int number = std::stoi(str);

// 旧版本 C++
std::istringstream iss(str);
int number;
iss >> number;

C++11+字符串处理新特性

字符串视图（std::string_view，C++17+）

std::string_view是C++17引入的一个非所有权字符串视图，用于提供对字符串的高效访问，避免不必要的字符串复制：

#include <string_view>
#include <string>

// 基本使用
std::string s = "Hello, world!";
std::string_view sv(s);
std::cout << sv << std::endl; // 输出：Hello, world!

// 从C风格字符串创建
const char* cstr = "Hello";
std::string_view sv2(cstr);

// 从子字符串创建（string_view 没有 (string, pos, len) 形式的构造函数，应先构造视图再用 substr 取子视图）
std::string_view sv3 = std::string_view(s).substr(0, 5); // 从索引0开始，长度5："Hello"

// 基本操作
std::cout << "Length: " << sv.length() << std::endl;
std::cout << "Empty: " << sv.empty() << std::endl;
std::cout << "Substring: " << sv.substr(7, 5) << std::endl; // "world"

// 查找操作
size_t pos = sv.find("world");
if (pos != std::string_view::npos) {
    std::cout << "Found 'world' at position: " << pos << std::endl;
}

// 比较操作（注意：starts_with / ends_with 为 C++20 新增，C++17 中不可用）
if (sv.starts_with("Hello")) {
    std::cout << "Starts with 'Hello'" << std::endl;
}

if (sv.ends_with("!")) {
    std::cout << "Ends with '!'" << std::endl;
}

std::string的新方法（C++11+）

C++11新方法

// 移动语义
std::string s1 = "Hello";
std::string s2 = std::move(s1); // 移动构造，s1处于有效但未指定的状态（通常为空，但不应依赖这一点）

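// front / back / pop_back（C++11新增；注意：std::string 没有 emplace_back，追加单个字符应使用 push_back）
std::string s;
s.push_back('H');
s.append("ello");
char last = s.back(); // 最后一个字符：'o'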

C++14新方法

// 字符串字面量操作符
using namespace std::string_literals;
std::string s = "Hello"s; // 等同于std::string("Hello")

// 原始字符串字面量与后缀
std::string raw = R"(Raw string with "quotes" and \backslashes)"s;

C++20新方法

// starts_with和ends_with
std::string s = "Hello, world!";
if (s.starts_with("Hello")) {
    std::cout << "Starts with 'Hello'" << std::endl;
}

if (s.ends_with("!")) {
    std::cout << "Ends with '!'" << std::endl;
}

// 支持多种参数类型：std::string_view、单个字符、C 风格字符串（不接受花括号初始化列表）
if (s.starts_with(std::string_view("He"))) {
    std::cout << "Starts with 'He'" << std::endl;
}

// 迭代器区间构造（自 C++98 起即可使用，并非 C++20 新增；C++23 另提供 std::string(std::from_range, chars)）
std::vector<char> chars = {'H', 'e', 'l', 'l', 'o'};
std::string s2(chars.begin(), chars.end());

C++23新方法

// contains
std::string s = "Hello, world!";
if (s.contains("world")) {
    std::cout << "Contains 'world'" << std::endl;
}

if (s.contains('o')) {
    std::cout << "Contains 'o'" << std::endl;
}

// resize_and_overwrite
std::string s;
s.resize_and_overwrite(10, [](char* buffer, size_t size) -> size_t {
    std::memcpy(buffer, "Hello", 5);
    return 5; // 返回实际写入的字符数
});
std::cout << s << std::endl; // 输出：Hello

正则表达式（C++11+）

C++11引入了std::regex库，用于字符串的模式匹配和替换：

#include <regex>
#include <string>

// 基本匹配
std::string s = "Hello, world!";
std::regex pattern("world");
if (std::regex_search(s, pattern)) {
    std::cout << "Found 'world'" << std::endl;
}

// 捕获组
std::string date = "2023-12-25";
std::regex datePattern(R"((\d{4})-(\d{2})-(\d{2}))");
std::smatch matches;
if (std::regex_search(date, matches, datePattern)) {
    std::cout << "Year: " << matches[1] << std::endl;
    std::cout << "Month: " << matches[2] << std::endl;
    std::cout << "Day: " << matches[3] << std::endl;
}

// 替换
std::string text = "Hello, world! Hello, C++!";
std::regex replacePattern("Hello");
std::string result = std::regex_replace(text, replacePattern, "Hi");
std::cout << result << std::endl; // 输出：Hi, world! Hi, C++!

// 正则表达式标志
std::regex caseInsensitivePattern("hello", std::regex::icase);
if (std::regex_search(s, caseInsensitivePattern)) {
    std::cout << "Found 'hello' (case insensitive)" << std::endl;
}

C++20新特性：format库

C++20引入了std::format库，提供了一种类型安全、灵活的字符串格式化方法：

#include <format>
#include <string>

// 基本格式化
std::string message = std::format("Hello, {}!", "world");
std::cout << message << std::endl; // 输出：Hello, world!

// 多个参数
std::string info = std::format("Name: {}, Age: {}", "Alice", 30);
std::cout << info << std::endl; // 输出：Name: Alice, Age: 30

// 格式化数字
std::string number = std::format("Pi is approximately {:.2f}", 3.14159);
std::cout << number << std::endl; // 输出：Pi is approximately 3.14

// 格式化宽度和对齐
std::string aligned = std::format("{:<10} {:>10}", "Left", "Right");
std::cout << aligned << std::endl; // 输出：Left              Right

// 格式化进制
std::string hex = std::format("Decimal: {}, Hex: {:x}, Octal: {:o}", 42, 42, 42);
std::cout << hex << std::endl; // 输出：Decimal: 42, Hex: 2a, Octal: 52

format库的优点

类型安全：相比printf，std::format是类型安全的
灵活性：支持自动编号和显式位置参数（如 {0}、{1}）；注意标准库的 std::format 不支持命名参数
可读性：格式化字符串更清晰易读
性能：性能与printf相当或更好
扩展性：支持自定义类型的格式化

C++23新特性：print库

C++23引入了std::print和std::println函数，提供了一种更方便的字符串输出方法：

#include <print>

// 基本输出
std::print("Hello, {}!", "world");

// 带换行
std::println("Hello, {}!", "world");

// 多个参数
std::println("Name: {}, Age: {}", "Alice", 30);

// 格式化数字
std::println("Pi is approximately {:.2f}", 3.14159);

Unicode字符串处理进阶

Unicode码点和代码单元

// Unicode码点是字符的数字表示
// UTF-8使用1-4个代码单元（字节）表示一个码点
// UTF-16使用1-2个代码单元（16位）表示一个码点
// UTF-32使用1个代码单元（32位）表示一个码点

// 遍历UTF-8字符串的码点（mbrtoc32 按当前 C locale 的多字节编码解码，调用前需用 std::setlocale 设置 UTF-8 locale，例如 "en_US.UTF-8"）
#include <cuchar>
#include <string>

/**
 * Prints every code point of a multibyte string to std::cout, one per line,
 * in the form "Code point: U+<hex digits>".
 *
 * Decoding is delegated to std::mbrtoc32, which interprets the bytes using
 * the multibyte encoding of the current C locale. The input is therefore only
 * decoded as UTF-8 when the caller has selected a UTF-8 locale beforehand
 * (e.g. via std::setlocale); in the default "C" locale non-ASCII bytes are
 * typically reported as invalid.
 *
 * @param utf8Str Bytes to decode; may contain embedded '\0' characters.
 *
 * Decoding stops silently at the first invalid or truncated sequence.
 * The formatting flags of std::cout are restored before returning.
 */
void printUtf8CodePoints(const std::string& utf8Str) {
    const char* p = utf8Str.data();
    const char* end = p + utf8Str.size();

    // Use an explicit conversion state instead of mbrtoc32's hidden static
    // one, so that separate calls cannot influence each other.
    std::mbstate_t state{};

    // std::hex is sticky; remember the caller's flags so they can be restored.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();

    while (p < end) {
        char32_t codePoint = 0;
        const size_t len = std::mbrtoc32(&codePoint, p, static_cast<size_t>(end - p), &state);
        if (len == static_cast<size_t>(-1) || len == static_cast<size_t>(-2)) {
            break; // invalid (-1) or incomplete (-2) multibyte sequence
        }

        // Print as an integer: streaming a char32_t directly is a deleted
        // overload since C++20.
        std::cout << "Code point: U+" << std::hex
                  << static_cast<unsigned long>(codePoint) << std::endl;

        if (len == static_cast<size_t>(-3)) {
            continue; // character came from pending state; no input consumed
        }
        // A return value of 0 means a null character was decoded. It still
        // occupies one byte of input, so advance by one to avoid looping
        // forever on an embedded '\0'.
        p += (len == 0) ? 1 : len;
    }

    std::cout.flags(savedFlags);
}

// 使用
std::string utf8Str = u8"Hello, 世界!";
printUtf8CodePoints(utf8Str);

Unicode字符串的转换

// UTF-8与UTF-16之间的转换
#include <codecvt>
#include <locale>

// UTF-8到UTF-16（使用 char16_t / std::u16string：wchar_t 在 Linux/macOS 上为 4 字节，不适合存放 UTF-16）
std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
std::string utf8Str = u8"Hello, 世界!";
std::u16string utf16Str = converter.from_bytes(utf8Str);

// UTF-16到UTF-8
std::string utf8Str2 = converter.to_bytes(utf16Str);

// 注意：C++17已弃用std::wstring_convert，推荐使用第三方库如ICU或Boost.Locale

常见错误和陷阱

1. 空指针解引用

// 错误：空指针解引用
const char* str = nullptr;
size_t len = strlen(str); // 未定义行为

// 正确：检查空指针
if (str != nullptr) {
    size_t len = strlen(str);
}

2. 缓冲区溢出

// 错误：缓冲区溢出
char buffer[10];
strcpy(buffer, "This string is too long"); // 缓冲区溢出

// 正确：使用安全的函数
strncpy(buffer, "This string is too long", sizeof(buffer) - 1);
buffer[sizeof(buffer) - 1] = '\0'; // 确保 null 终止

// 更好：使用 std::string
std::string buffer = "This string is too long";

3. 忘记 null 终止符

// 错误：忘记 null 终止符
char buffer[10];
for (int i = 0; i < 10; i++) {
    buffer[i] = 'a';
}
std::cout << buffer << std::endl; // 未定义行为，缺少 null 终止符

// 正确：添加 null 终止符
char buffer[11]; // 多留一个位置
for (int i = 0; i < 10; i++) {
    buffer[i] = 'a';
}
buffer[10] = '\0'; // 添加 null 终止符
std::cout << buffer << std::endl;

4. 字符串字面量的修改

// 错误：修改字符串字面量
char* str = "Hello"; // 自 C++11 起不允许将字符串字面量转换为 char*，现代编译器会直接报错
str[0] = 'h'; // 未定义行为，字符串字面量是常量

// 正确：使用可修改的数组
char str[] = "Hello";
str[0] = 'h'; // 合法

5. 混合使用 C 风格字符串和 std::string

// 潜在问题：混合使用
std::string s = "Hello";
const char* cstr = s.c_str();
// 不要在 s 被修改后使用 cstr，因为 cstr 可能失效

// 安全使用
std::string s = "Hello";
std::string copy = s; // 创建副本
const char* cstr = copy.c_str(); // 使用副本的 c_str()

小结

本章介绍了C++中的字符和字符串处理，包括：

字符类型：char、wchar_t、char16_t、char32_t、char8_t（C++20）
C风格字符串：字符数组、字符串字面量、字符串操作函数
std::string 类：C++标准库提供的字符串类，具有丰富的成员函数
字符串流：istringstream 和 ostringstream，用于字符串的输入输出
宽字符串：wchar_t 和 std::wstring
Unicode 字符串：UTF-8、UTF-16、UTF-32 字符串
字符串的最佳实践：优先使用 std::string，避免缓冲区溢出等
常见错误和陷阱：空指针解引用、缓冲区溢出、忘记 null 终止符等

字符串是C++程序中最常用的数据类型之一，掌握好字符串的处理方法对于编写高效、可靠的程序至关重要。在实际编程中，应优先使用 std::string 类，它提供了更安全、更便捷的字符串操作方式。同时，也要了解 C 风格字符串的基本概念和操作函数，因为在一些遗留代码或与 C 库交互的场景中仍然会用到。

在后续章节中，我们将学习更高级的C++特性，如内存模型、面向对象编程、模板等，这些特性将与字符串处理结合使用，帮助我们构建更复杂、更强大的程序。

第8章 字符和字符串