commit 8b0bb6e753 (parent 4931890ba2)
@@ -96,9 +96,19 @@ class DbfToPostgresCtllogPipeline(BasePipeline):
         # Process the files
         total_processed = 0

+        file_list = []
         for filename in os.listdir(self.dbf_dir):
             if filename.casefold().endswith('.dbf'):
                 file_path = os.path.join(self.dbf_dir, filename)
+                # Get the last-modified time
+                mtime = os.path.getmtime(file_path)
+                file_list.append((mtime, file_path, filename))
+
+        # Sort by last-modified time (earliest first)
+        file_list.sort(key=lambda x: x[0])
+        # Process the files in that order
+        for mtime, file_path, filename in file_list:
             processed = self.process_file(file_path, mapping_dict)
             total_processed += processed
             self.logger.info(f"Processed {processed} records from {filename}")
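Note: os.path.getmtime returns the modification time as a float epoch, so the tuple list above orders the files from the earliest change to the latest and stats each file only once. The same ordering can also be written with sorted() and a key function; a minimal sketch, with dbf_dir standing in for self.dbf_dir:

import os

dbf_dir = '/data/dbf-input'  # placeholder for self.dbf_dir
paths = [
    os.path.join(dbf_dir, name)
    for name in os.listdir(dbf_dir)
    if name.casefold().endswith('.dbf')
]
for path in sorted(paths, key=os.path.getmtime):
    ...  # process_file(path, mapping_dict) in the pipeline

Collecting (mtime, file_path, filename) tuples up front, as the diff does, keeps the original file name available inside the loop, so either form works.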
@@ -265,10 +275,18 @@ class DbfToPostgresCtllogPipeline(BasePipeline):
             return

         # Actually insert into the database
+        try:
             self.db.execute_batch(
-                "INSERT INTO controller_log (created, control_group_controller_id, point_id, real_value) VALUES (%s, %s, %s, %s)",
+                """
+                INSERT INTO controller_log (created, control_group_controller_id, point_id, real_value)
+                VALUES (%s, %s, %s, %s)
+                ON CONFLICT (created, control_group_controller_id, point_id)
+                DO UPDATE SET real_value = EXCLUDED.real_value
+                """,
                 [(data[0], data[1], data[2], data[3]) for data in self.batch_data]
             )
+        except Exception as e:
+            self.logger.error(f"Batch Insert data failed: {e}")

         # todo: write to CSV when debugging (debug only)
         # with open(self.csv_file_path, "a", newline="") as f:
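The rewritten statement is an upsert: ON CONFLICT ... DO UPDATE only works if controller_log has a unique constraint or unique index covering (created, control_group_controller_id, point_id); without one PostgreSQL rejects the statement. The Database wrapper is not shown in this diff, so purely as an assumption that it delegates to something like psycopg2's execute_batch, a standalone sketch of the same call:

# Sketch only: assumes a psycopg2 connection and a unique constraint on the three key columns;
# self.db.execute_batch in the pipeline is a project wrapper that is not part of this diff.
from psycopg2.extras import execute_batch

UPSERT_SQL = """
    INSERT INTO controller_log (created, control_group_controller_id, point_id, real_value)
    VALUES (%s, %s, %s, %s)
    ON CONFLICT (created, control_group_controller_id, point_id)
    DO UPDATE SET real_value = EXCLUDED.real_value
"""

def flush_upsert(conn, batch_data):
    # batch_data: iterable of (created, controller_id, point_id, real_value) tuples
    with conn.cursor() as cur:
        execute_batch(cur, UPSERT_SQL, batch_data, page_size=1000)
    conn.commit()

Catching the exception and only logging it means a failed flush drops that batch; whether that is acceptable depends on how the pipeline is rerun.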
@@ -12,10 +12,10 @@ from pipelines.base_pipeline import BasePipeline
 class DbfToPostgresCtllogPwcPipeline(BasePipeline):
     def __init__(self, config):
         super().__init__(config)
-        # todo: enable for local debugging
+        # todo:debug use
         # self.data_root = 'D:\disney_test'
         # self.mapping_file = 'D:\disney_test\disney-mapping-elec-v3.xlsx'
-        # todo: enable for local debugging
+        # todo:debug use
         self.data_root = os.getenv('DATA_PVC_MOUNT_PATH', '/data')
         self.mapping_file = os.getenv('MAPPING_FILE')
         self.dbf_dir = os.getenv('DBF_INPUT_DIR', os.path.join(self.data_root, 'dbf-input'))
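One small caveat for the commented-out local paths here (and the live ones in the new file further down): 'D:\disney_test' is an ordinary string literal, so \d is an invalid escape sequence and newer Python versions warn about it. A minimal alternative, assuming the same Windows layout, is a raw string:

# Raw strings keep the backslashes literal without escape warnings.
data_root = r'D:\disney_test'
mapping_file = r'D:\disney_test\disney-mapping-elec-v3.xlsx'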
@@ -94,9 +94,19 @@ class DbfToPostgresCtllogPwcPipeline(BasePipeline):
         # Process the files
         total_processed = 0

+        file_list = []
         for filename in os.listdir(self.dbf_dir):
             if filename.casefold().endswith('.dbf'):
                 file_path = os.path.join(self.dbf_dir, filename)
+                # Get the last-modified time
+                mtime = os.path.getmtime(file_path)
+                file_list.append((mtime, file_path, filename))
+
+        # Sort by last-modified time (earliest first)
+        file_list.sort(key=lambda x: x[0])
+        # Process the files in that order
+        for mtime, file_path, filename in file_list:
             processed = self.process_file(file_path, mapping_dict)
             total_processed += processed
             self.logger.info(f"Processed {processed} records from {filename}")
@@ -177,10 +187,18 @@ class DbfToPostgresCtllogPwcPipeline(BasePipeline):
             return

         # Actually insert into the database
+        try:
             self.db.execute_batch(
-                "INSERT INTO controller_log (created, control_group_controller_id, point_id, real_value) VALUES (%s, %s, %s, %s)",
+                """
+                INSERT INTO controller_log (created, control_group_controller_id, point_id, real_value)
+                VALUES (%s, %s, %s, %s)
+                ON CONFLICT (created, control_group_controller_id, point_id)
+                DO UPDATE SET real_value = EXCLUDED.real_value
+                """,
                 [(data[0], data[1], data[2], data[3]) for data in self.batch_data]
             )
+        except Exception as e:
+            self.logger.error(f"Batch Insert data failed: {e}")

         # todo: write to CSV when debugging (debug only)
         # with open(self.csv_file_path, "a", newline="") as f:
src/pipelines/log_to_postgres_ctllog.py (new file, 317 lines)

@@ -0,0 +1,317 @@
import os
import csv
import re
import pandas as pd
from dbfread import DBF
from itertools import islice
from datetime import datetime, timedelta
import logging
from core.database import Database
from core.utils import size_to_human_readable, calculate_file_hash
from pipelines.base_pipeline import BasePipeline


class LogToPostgresCtllogPipeline(BasePipeline):
    def __init__(self, config):
        super().__init__(config)
        # todo: enable for local debugging
        self.data_root = 'D:\disney_test'
        self.mapping_file = 'D:\disney_test\disney-mapping-v2.xlsx'
        # todo: enable for local debugging
        # self.data_root = os.getenv('DATA_PVC_MOUNT_PATH', '/data')
        # self.mapping_file = os.getenv('MAPPING_FILE')
        self.log_dir = os.getenv('DBF_INPUT_DIR', os.path.join(self.data_root, 'dbf-input'))

        # todo: debug use
        self.csv_file_path = 'D:\disney_test\debug_controller_log_logformat.csv'
        # Initialize the CSV file
        if not os.path.exists(self.csv_file_path):
            with open(self.csv_file_path, 'w') as f:
                csv.writer(f).writerow(
                    ['created', 'control_group_controller_id', 'point_id', 'real_value'])
        # todo: debug use

        self.db = None
        self.group_cache = {}
        self.batch_size = int(os.getenv('BATCH_SIZE', 1000))
        self.batch_data = []
        self.seen = set()  # used for de-duplication
        self.processed_records = 0
        self.current_date = None  # date currently being processed

    def validate_config(self):
        # Make sure the directory exists
        if not os.path.exists(self.log_dir):
            raise ValueError(f"DBF directory not found: {self.log_dir}")

        # If a mapping file is configured, verify that it exists
        if self.mapping_file and not os.path.exists(self.mapping_file):
            self.logger.warning(f"Mapping file not found: {self.mapping_file}")
            self.mapping_file = None

    def load_mapping(self):
        """Load the mapping table"""
        if not self.mapping_file:
            self.logger.info("No mapping file provided, using default mapping")
            return {}

        try:
            self.logger.info(f"Loading mapping from {self.mapping_file}")
            mapping_df = pd.read_excel(self.mapping_file, sheet_name="Mapping")

            # Clean the data
            mapping_df = mapping_df.dropna(subset=['AS_SERIAL', 'ID', 'data_field_sequence_id'])
            mapping_df = mapping_df[['AS_SERIAL', 'ID', 'data_field_sequence_id', 'disney_device_point_name',
                                     'control_group_controller_id', 'controller_point_id',
                                     'control_group_name', 'control_group_id']]

            # Build the mapping dictionary
            mapping_dict = {}
            for _, row in mapping_df.iterrows():
                key = (str(row['AS_SERIAL']), str(row['ID']))
                if key not in mapping_dict:
                    mapping_dict[key] = []

                mapping_dict[key].append({
                    'seq_id': int(row['data_field_sequence_id']),
                    'disney_device_point_name': row['disney_device_point_name'].strip(),
                    'control_group_controller_id': int(row['control_group_controller_id']),
                    'point_id': int(row['controller_point_id']),
                    'control_group_name': row['control_group_name'].strip(),
                    'control_group_id': int(row['control_group_id'])
                })

            self.logger.info(f"Loaded {len(mapping_dict)} mapping entries")
            return mapping_dict
        except Exception as e:
            self.logger.error(f"Failed to load mapping: {str(e)}")
            return {}

    def process(self):
        # Load the mapping
        mapping_dict = self.load_mapping()

        # Connect to the database
        # todo: turned off for local debugging
        # db_config = self.config.get_database_config()
        # self.db = Database(**db_config)
        # todo: turned off for local debugging

        # Process the files
        total_processed = 0

        for root, dirs, files in os.walk(self.log_dir):
            for filename in files:
                if filename.casefold().endswith('.log'):
                    full_path = os.path.join(root, filename)
                    # The file name (without extension) is the Disney device ID (mb-xxx),
                    # i.e. the ID field in the mapping
                    device_id = os.path.splitext(filename)[0].upper()
                    # The file's parent directory name is the MAC address of the Disney building,
                    # i.e. AS_SERIAL in the mapping
                    as_serial = os.path.basename(root)
                    processed = self.process_log_file(full_path, as_serial, device_id, mapping_dict)
                    total_processed += processed
                    self.logger.info(f"Processed {processed} records from {filename}")

        # Close the database connection
        # todo: turned off for local debugging
        # self.db.disconnect()
        # todo: turned off for local debugging
        return total_processed

    def clean_header(self, header):
        """More robust header cleaning"""
        # Strip any surrounding quotes first
        header = header.strip().strip("'").strip('"')
        # Remove parentheses and their contents
        return re.sub(r'\s*\([^)]*\)', '', header).strip()

    def clean_header_debug(self, headers):
        """Header cleaning with debug output"""
        cleaned_headers = []
        print(f"Start processing CSV headers, {len(headers)} columns in total")
        print(f"Original headers: {headers}")

        for i, original_header in enumerate(headers):
            try:
                print(f"\nProcessing column {i + 1}/{len(headers)}: '{original_header}'")

                # Type check
                if not isinstance(original_header, str):
                    print(f"Warning: header is not a string, its type is {type(original_header)}")
                    header_str = str(original_header)
                    print(f"Converted to string: '{header_str}'")
                else:
                    header_str = original_header

                # Length info
                orig_length = len(header_str)
                print(f"Original length: {orig_length} characters")

                # Strip quotes
                stripped = header_str.strip().strip("'").strip('"')
                stripped_length = len(stripped)
                print(f"After stripping quotes: '{stripped}' ({stripped_length} characters)")

                # Remove parenthesised content
                cleaned = re.sub(r'\s*\([^)]*\)', '', stripped).strip()
                cleaned_length = len(cleaned)
                print(f"After removing parentheses: '{cleaned}' ({cleaned_length} characters)")

                # Empty-value check
                if not cleaned:
                    print(f"Warning: header is empty after cleaning, using a generated name")
                    cleaned = f"Column_{i + 1}"

                cleaned_headers.append(cleaned)
                print(f"Done: '{original_header}' => '{cleaned}'")

            except Exception as e:
                print(f"\nError! Failed to clean header: '{original_header}'")
                print(f"Details: {str(e)}")
                print(f"Falling back to the original header")

                if original_header:
                    cleaned_headers.append(original_header)
                else:
                    cleaned_headers.append(f"Column_{i + 1}")

        print(f"\nHeader cleaning finished")
        print(f"Cleaned headers: {cleaned_headers}")
        return cleaned_headers

    def process_log_file(self, file_path, as_serial, device_id, mapping_dict):
        log_table = []
        self.logger.info(f"Processing file: {os.path.basename(file_path)}")

        try:
            # Collect file info
            file_size = os.path.getsize(file_path)
            file_hash = calculate_file_hash(file_path)
            self.logger.info(f"File info: Size={size_to_human_readable(file_size)}, Hash={file_hash}")
            with open(file_path, 'r', newline='', encoding='utf-8-sig') as file:
                # Create the CSV reader
                csv_reader = csv.reader(file)

                try:
                    # Read the header row
                    headers = next(csv_reader)
                except StopIteration:
                    self.logger.info("CSV is empty")
                    headers = []

                # Clean the headers, with debug output
                cleaned_headers = self.clean_header_debug(headers)

                # Create a DictReader
                dict_reader = csv.DictReader(
                    file,
                    fieldnames=cleaned_headers
                )
                for row in dict_reader:
                    log_table.append(row)

            # Process the log table
            self.logger.info(f"the Log file: {os.path.basename(file_path)} have record: #{len(log_table)}")
            for record in log_table:
                self.process_record(record, as_serial, device_id, mapping_dict)

            # Make sure all remaining data is flushed
            self.final_flush()

            self.logger.info(f"Processed {self.processed_records} records from {os.path.basename(file_path)}")
            return self.processed_records

        except Exception as e:
            self.logger.error(f"Failed to process file {file_path}: {str(e)}")
            return

    def process_record(self, record, as_serial, device_id, mapping_dict):
        """Process a single record"""
        try:
            # The Modbus device name comes from the file name
            # AS_SERIAL comes from the name of the file's parent directory
            key = (as_serial, device_id)

            if key not in mapping_dict:
                return

            dt_str = record.get('time', '')
            if not dt_str:
                return

            # Strip surrounding quotes
            clean_dt_str = dt_str.strip().strip("'").strip('"')
            record_time = datetime.strptime(clean_dt_str, '%Y-%m-%d %H:%M:%S')

            # Detect a date change
            record_date = record_time.date()
            if self.current_date is None:
                self.current_date = record_date
            elif record_date != self.current_date:
                # When the date changes, process the cached data for the previous day
                self.process_day_cache()
                self.current_date = record_date

            hour_key = record_time.replace(minute=0, second=0, microsecond=0)

            # seq_id is no longer used; look the value up in the record by disney_device_point_name
            for mapping in mapping_dict[key]:
                data_field = mapping['disney_device_point_name']
                # data_field = f"DATA{mapping['seq_id']:02d}"
                value = record.get(data_field)
                if value is None:
                    continue

                try:
                    float_value = float(value)
                except ValueError:
                    continue

                # todo: rework this; log-format data quality is good enough that this complex
                # handling is unnecessary and records could be written directly, as the pwc pipeline does
                # Build the group key
                group_key = (
                    mapping['control_group_controller_id'],
                    mapping['point_id'],
                    hour_key
                )

                # Add to the group cache
                self.add_to_group_cache(group_key, record_time, float_value, mapping)

                # Check the batch size
                if len(self.batch_data) >= self.batch_size:
                    self.flush_batch()
                # todo: rework this; log-format data quality is good enough that this complex
                # handling is unnecessary and records could be written directly, as the pwc pipeline does

        except Exception as e:
            self.logger.error(f"Error processing record: {e}")

    def flush_batch(self):
        """Run the batch insert and clear the batch buffer"""
        if not self.batch_data:
            return

        # Actually insert into the database
        # self.db.execute_batch(
        #     "INSERT INTO controller_log (created, control_group_controller_id, point_id, real_value) VALUES (%s, %s, %s, %s)",
        #     [(data[0], data[1], data[2], data[3]) for data in self.batch_data]
        # )

        # todo: write to CSV when debugging (debug only)
        with open(self.csv_file_path, "a", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(self.batch_data)
        # todo: write to CSV when debugging

        # Update the processed-record counter
        processed_count = len(self.batch_data)
        self.processed_records += processed_count
        self.logger.info(f"Inserted {processed_count} records, total {self.processed_records}")

        # Clear the batch buffer
        self.batch_data = []
        self.seen.clear()

    def final_flush(self):
        # Flush the remaining batch data
        self.flush_batch()
Loading…
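The header cleaning used by clean_header and clean_header_debug strips surrounding quotes and then drops any parenthesised suffix (e.g. a unit). A minimal check with hypothetical header values:

# Hypothetical header values; the stripping and regex mirror clean_header above.
import re

def clean_header(header: str) -> str:
    header = header.strip().strip("'").strip('"')
    return re.sub(r'\s*\([^)]*\)', '', header).strip()

assert clean_header("'Temperature (degC)'") == 'Temperature'
assert clean_header('"Flow Rate (m3/h)"') == 'Flow Rate'
assert clean_header('time') == 'time'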