tool-databridge/src/pipelines/dbf_to_postgres_ctllog.py
import os
import logging
from datetime import datetime, timedelta
from itertools import islice  # only referenced by the commented-out chunked-read experiment below

import pandas as pd
from dbfread import DBF

from core.database import Database
from core.utils import size_to_human_readable, calculate_file_hash
from pipelines.base_pipeline import BasePipeline


class DbfToPostgresCtllogPipeline(BasePipeline):
    def __init__(self, config):
        super().__init__(config)
        # TODO: uncomment these overrides for local debugging
        # self.data_root = r'D:\disney_test'
        # self.mapping_file = r'D:\disney_test\disney-mapping.xlsx'
        self.data_root = os.getenv('DATA_PVC_MOUNT_PATH', '/data')
        self.mapping_file = os.getenv('MAPPING_FILE')
        self.dbf_dir = os.getenv('DBF_INPUT_DIR', os.path.join(self.data_root, 'dbf-input'))
        self.db = None

    def validate_config(self):
        # Make sure the DBF input directory exists
        if not os.path.exists(self.dbf_dir):
            raise ValueError(f"DBF directory not found: {self.dbf_dir}")
        # If a mapping file is configured, verify that it exists
        if self.mapping_file and not os.path.exists(self.mapping_file):
            self.logger.warning(f"Mapping file not found: {self.mapping_file}")
            self.mapping_file = None

    def load_mapping(self):
        """Load the (AS_SERIAL, ID) -> data point mapping from the Excel mapping file."""
        if not self.mapping_file:
            self.logger.info("No mapping file provided, using default mapping")
            return {}
        try:
            self.logger.info(f"Loading mapping from {self.mapping_file}")
            mapping_df = pd.read_excel(self.mapping_file, sheet_name="Mapping")
            # Drop incomplete rows and keep only the columns we need
            mapping_df = mapping_df.dropna(subset=['AS_SERIAL', 'ID', 'data_field_sequence_id'])
            mapping_df = mapping_df[['AS_SERIAL', 'ID', 'data_field_sequence_id',
                                     'control_group_controller_id', 'controller_point_id']]
            # Build the mapping dictionary keyed by (AS_SERIAL, ID)
            mapping_dict = {}
            for _, row in mapping_df.iterrows():
                key = (str(row['AS_SERIAL']), str(row['ID']))
                if key not in mapping_dict:
                    mapping_dict[key] = []
                mapping_dict[key].append({
                    'seq_id': int(row['data_field_sequence_id']),
                    'control_group_controller_id': int(row['control_group_controller_id']),
                    'point_id': int(row['controller_point_id'])
                })
            self.logger.info(f"Loaded {len(mapping_dict)} mapping entries")
            return mapping_dict
        except Exception as e:
            self.logger.error(f"Failed to load mapping: {str(e)}")
            return {}
def process(self):
# 加载映射关系
mapping_dict = self.load_mapping()
# 连接数据库
# todo本地调试时打开
db_config = self.config.get_database_config()
self.db = Database(**db_config)
# todo本地调试时打开
# 处理文件
total_processed = 0
for filename in os.listdir(self.dbf_dir):
if filename.casefold().endswith('.dbf'):
file_path = os.path.join(self.dbf_dir, filename)
processed = self.process_file(file_path, mapping_dict)
total_processed += processed
self.logger.info(f"Processed {processed} records from {filename}")
# 关闭数据库连接
# todo本地调试时打开
self.db.disconnect()
# todo本地调试时打开
return total_processed

    def process_file(self, file_path, mapping_dict):
        self.logger.info(f"Processing file: {os.path.basename(file_path)}")
        try:
            # Collect basic file information
            file_size = os.path.getsize(file_path)
            file_hash = calculate_file_hash(file_path)
            self.logger.info(f"File info: Size={size_to_human_readable(file_size)}, Hash={file_hash}")
            dbf_table = DBF(file_path, encoding='utf-8')
            batch_data = []
            processed_records = 0
            batch_size = int(os.getenv('BATCH_SIZE', 1000))
            self.logger.info(f"DBF file {os.path.basename(file_path)} contains {len(dbf_table.records)} records")
            # Chunked reading turned out to be a trap here; read record by record instead.
            # dbf_table = DBF(file_path, load=False, encoding='utf-8')
            # chunk_idx = 0
            # while True:
            #     chunk = list(islice(dbf_table._iter_records(), 100000))
            #     if not chunk:  # end of file
            #         break
            #     chunk_idx += 1
            #     # process these 100,000 rows
            #     self.logger.info(f"Handle chunk: #{chunk_idx} of file: {os.path.basename(file_path)}")
            for record in dbf_table:
                try:
                    as_serial = str(record.get('AS_SERIAL', '')).strip()
                    device_id = str(record.get('ID', '')).strip()
                    key = (as_serial, device_id)
                    # Skip records that have no mapping
                    if key not in mapping_dict:
                        continue
                    # Shift the timestamp by +8 hours
                    dt_str = record.get('DATETIME', '')
                    if not dt_str:
                        continue
                    original_time = datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S')
                    target_time = original_time + timedelta(hours=8)
                    formatted_time = target_time.strftime('%Y-%m-%d %H:%M:%S+00')
                    # Emit one row per mapped data field
                    for mapping in mapping_dict[key]:
                        data_field = f"DATA{mapping['seq_id']:02d}"
                        value = record.get(data_field)
                        if value is None:
                            continue
                        batch_data.append((
                            formatted_time,
                            mapping['control_group_controller_id'],
                            mapping['point_id'],
                            float(value)
                        ))
                    # Flush a full batch to the database
                    if len(batch_data) >= batch_size:
                        # TODO: comment out when debugging locally
                        self.db.execute_batch(
                            "INSERT INTO controller_log (created, control_group_controller_id, point_id, real_value) VALUES (%s, %s, %s, %s)",
                            batch_data
                        )
                        processed_records += len(batch_data)
                        self.logger.debug(f"Processed {processed_records} records from {os.path.basename(file_path)}")
                        batch_data = []
                except Exception as e:
                    self.logger.warning(f"Skipping record due to error: {str(e)}")
                    continue
            # Insert the remaining records
            if batch_data:
                # TODO: comment out when debugging locally
                self.db.execute_batch(
                    "INSERT INTO controller_log (created, control_group_controller_id, point_id, real_value) VALUES (%s, %s, %s, %s)",
                    batch_data
                )
                processed_records += len(batch_data)
                self.logger.debug(f"Processed {processed_records} records from {os.path.basename(file_path)}")
            self.logger.info(f"Processed {processed_records} records from {os.path.basename(file_path)}")
            return processed_records
        except Exception as e:
            self.logger.error(f"Failed to process file {file_path}: {str(e)}")
            return 0
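

# A minimal local-run sketch, not part of the original pipeline wiring. It
# assumes the pipeline is normally constructed by a runner that passes a
# config object exposing get_database_config(), as process() uses above; the
# _LocalConfig class and its connection values are hypothetical placeholders
# for local debugging only.
if __name__ == '__main__':
    class _LocalConfig:
        def get_database_config(self):
            # Hypothetical local connection settings; adjust to your environment
            # and to whatever keyword arguments core.database.Database expects.
            return {
                'host': 'localhost',
                'port': 5432,
                'dbname': 'ctllog',
                'user': 'postgres',
                'password': 'postgres',
            }

    pipeline = DbfToPostgresCtllogPipeline(_LocalConfig())
    pipeline.validate_config()
    print(f"Imported {pipeline.process()} records")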