pandas.DataFrame.to_excel报错:openpyxl.utils.exceptions.IllegalCharacterError

由于待处理的文件内容里含有非法字符,转换成excel的时候出错

openpyxl.utils.exceptions.IllegalCharacterError:
使用下面的函数清理下字符即可

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pandas as pd

def clean_value(value):
""" This function cleans a value to remove illegal characters """
if isinstance(value, str):
# Define a regex pattern to match illegal characters
illegal_chars_pattern = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')
# Replace illegal characters with an empty string
cleaned_value = illegal_chars_pattern.sub('', value)
if cleaned_value != value:
print(f"Illegal characters found in '{value}'. \nCleaned value: '{cleaned_value}'")
return cleaned_value
return value

def clean_dataframe(df):
""" This function cleans all values in a DataFrame to remove illegal characters """
return df.applymap(clean_value)

data = []
with open(file_path, 'r', encoding='utf-8') as file:
for line in file:
# Parse each JSON object
data.append(json.loads(line))

df = pd.DataFrame(data)
df = clean_dataframe(df)
df.to_excel(...)